Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import binascii
2from binascii import Error as BinasciiError
3from binascii import unhexlify
4from math import ceil
5from typing import Any, Dict, List, Tuple, Union, cast
7from ._codecs import adobe_glyphs, charset_encoding
8from ._utils import logger_error, logger_warning
9from .generic import (
10 ArrayObject,
11 DecodedStreamObject,
12 DictionaryObject,
13 NullObject,
14 StreamObject,
15 is_null_or_none,
16)
# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Determine information about a font.

    Args:
        font_name: font name as a string
        space_width: default space width if no data is found.
        obj: XObject or Page where you can find a /Resources dictionary

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        character map, font dictionary.
        The font dictionary itself is suitable for the curious.

    """
    # The font dictionary is looked up as obj["/Resources"]["/Font"][font_name].
    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
        space_width, ft
    )
    return font_subtype, font_halfspace, font_encoding, font_map, ft
def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Determine information about a font.

    Args:
        space_width: default space width if no data found
            (normally half the width of a character).
        ft: Font Dictionary

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        character map.

    """
    font_type = cast(str, ft["/Subtype"].get_object())
    encoding, map_dict = get_encoding(ft)

    # Find the key under which a space is stored for this font, then use
    # its width (halved) as the space criteria.
    space_key_char = get_actual_str_key(" ", encoding, map_dict)
    font_width_map = build_font_width_map(ft, space_width * 2.0)
    half_space_width = compute_space_width(font_width_map, space_key_char) / 2.0

    return (
        font_type,
        half_space_width,
        encoding,
        # https://github.com/python/mypy/issues/4374
        map_dict
    )
# used when missing data, e.g. font def missing
# Layout: (font sub-type, space width criteria, encoding, character map).
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    # every one of the 256 single-byte codes maps to the same placeholder
    dict.fromkeys(range(256), "�"),
    {},
)
84_predefined_cmap: Dict[str, str] = {
85 "/Identity-H": "utf-16-be",
86 "/Identity-V": "utf-16-be",
87 "/GB-EUC-H": "gbk",
88 "/GB-EUC-V": "gbk",
89 "/GBpc-EUC-H": "gb2312",
90 "/GBpc-EUC-V": "gb2312",
91 "/GBK-EUC-H": "gbk",
92 "/GBK-EUC-V": "gbk",
93 "/GBK2K-H": "gb18030",
94 "/GBK2K-V": "gb18030",
95 "/ETen-B5-H": "cp950",
96 "/ETen-B5-V": "cp950",
97 "/ETenms-B5-H": "cp950",
98 "/ETenms-B5-V": "cp950",
99 "/UniCNS-UTF16-H": "utf-16-be",
100 "/UniCNS-UTF16-V": "utf-16-be",
101 "/UniGB-UTF16-H": "gb18030",
102 "/UniGB-UTF16-V": "gb18030",
103 # UCS2 in code
104}
106# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
107_default_fonts_space_width: Dict[str, int] = {
108 "/Courier": 600,
109 "/Courier-Bold": 600,
110 "/Courier-BoldOblique": 600,
111 "/Courier-Oblique": 600,
112 "/Helvetica": 278,
113 "/Helvetica-Bold": 278,
114 "/Helvetica-BoldOblique": 278,
115 "/Helvetica-Oblique": 278,
116 "/Helvetica-Narrow": 228,
117 "/Helvetica-NarrowBold": 228,
118 "/Helvetica-NarrowBoldOblique": 228,
119 "/Helvetica-NarrowOblique": 228,
120 "/Times-Roman": 250,
121 "/Times-Bold": 250,
122 "/Times-BoldItalic": 250,
123 "/Times-Italic": 250,
124 "/Symbol": 250,
125 "/ZapfDingbats": 278,
126}
def get_encoding(
    ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Build the encoding and the character map of a font.

    Args:
        ft: Font Dictionary

    Returns:
        A tuple (encoding, map_dict): the result of ``_parse_encoding`` and
        the character map produced by ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    #   if cmap not empty encoding should be discarded
    #   (here transformed into identity for those characters)
    # If encoding is a string, it is expected to be an identity translation.
    if isinstance(encoding, dict):
        for x in int_entry:
            if x <= 255:
                encoding[x] = chr(x)

    return encoding, map_dict
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, Dict[int, str]]:
    """
    Interpret the /Encoding entry of a font dictionary.

    Args:
        ft: Font Dictionary

    Returns:
        Either a string (a codec name taken from ``_predefined_cmap``,
        "utf-16-be", "charmap", or the unrecognised /Encoding name itself)
        or a dict mapping character codes to strings.

    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: use the table registered for the base font
        # if there is one, otherwise fall back to "charmap".
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            # Best effort on untrusted input: report the base encoding that
            # could not be used and fall back to the standard encoding.
            logger_error(
                f"Advanced encoding {enc['/BaseEncoding']} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is iterated as a flat sequence: an integer sets the
        # current code, every other entry is stored at the current code,
        # which is then incremented.
        x: int = 0
        o: Union[int, str]
        for o in cast(ArrayObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # entry not usable as an adobe_glyphs key: keep it as is
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding
def _parse_to_unicode(
    ft: DictionaryObject
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Build the character map described by the /ToUnicode entry of a font.

    Args:
        ft: Font Dictionary

    Returns:
        A tuple (map_dict, int_entry). Without /ToUnicode, the result of
        ``_type1_alternative`` is returned for /Type1 fonts and ``({}, [])``
        for any other font.

    """
    # will store all translation code
    # and map_dict[-1] we will have the number of bytes to convert
    map_dict: Dict[Any, Any] = {}

    # will provide the list of cmap keys as int to correct encoding
    int_entry: List[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []
    process_rg: bool = False
    process_char: bool = False
    multiline_rg: Union[
        None, Tuple[int, int]
    ] = None  # tuple = (current_char, remaining size) ; cf #1285 for example of file
    cm = prepare_cm(ft)
    # The parsing state is threaded through process_cm_line line by line.
    for line in cm.split(b"\n"):
        process_rg, process_char, multiline_rg = process_cm_line(
            line.strip(b" \t"),
            process_rg,
            process_char,
            multiline_rg,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry
def get_actual_str_key(
    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
    """
    Find the key under which ``value_char`` is stored for a font.

    Args:
        value_char: the character to look for among the mapped values
        encoding: encoding of the font; searched when it is a dict
        map_dict: character map; searched when ``encoding`` is not a dict

    Returns:
        When ``encoding`` is a dict, ``chr(code)`` of the matching code;
        otherwise the matching key of ``map_dict``. If several entries
        match, the last one in iteration order is used. ``value_char``
        itself is returned when nothing matches.

    """
    if isinstance(encoding, dict):
        matches = [chr(code) for code, char in encoding.items() if char == value_char]
    else:
        matches = [key for key, char in map_dict.items() if char == value_char]
    return matches[-1] if matches else value_char
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode CMap of a font, normalised for line-based parsing.

    Args:
        ft: Font Dictionary; must contain a /ToUnicode entry.

    Returns:
        The CMap data as bytes in which the bfchar/bfrange keywords sit on
        their own lines, the ``<`` ``>`` delimiters of hex strings are
        removed (tokens are separated by spaces) and an empty hex string
        is replaced by the placeholder ``.``.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    # Each chunk after a "<" holds the hex digits up to the first ">";
    # spaces inside the hex string are dropped.
    ll = cm.split(b"<")
    for i, chunk in enumerate(ll):
        j = chunk.find(b">")
        if j < 0:
            continue
        if j == 0:
            # string is empty: stash a placeholder here (see below)
            # see https://github.com/py-pdf/pypdf/issues/1111
            content = b"."
        else:
            content = chunk[:j].replace(b" ", b"")
        ll[i] = content + b" " + chunk[j + 1 :]
    cm = (
        (b" ".join(ll))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
    return cm
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Process one line of a prepared CMap and update the parsing state.

    Args:
        line: the line to process
        process_rg: True while inside a beginbfrange/endbfrange section
        process_char: True while inside a beginbfchar/endbfchar section
        multiline_rg: state of a bfrange list continued over several
            lines, or None
        map_dict: character map, updated in place
        int_entry: list of mapped character codes, updated in place

    Returns:
        The updated (process_rg, process_char, multiline_rg).

    """
    # empty lines and comment lines leave the state untouched
    if line == b"" or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        process_rg = True
    elif b"endbfrange" in line:
        process_rg = False
    elif b"beginbfchar" in line:
        process_char = True
    elif b"endbfchar" in line:
        process_char = False
    elif process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        # A broken hex string in a bfchar line is handled like one in a
        # bfrange line: the line is skipped with a warning.
        try:
            parse_bfchar(line, map_dict, int_entry)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    return process_rg, process_char, multiline_rg
def _bfrange_key(code: int, nb_bytes: int) -> str:
    """Return the map_dict key for ``code`` written on ``nb_bytes`` bytes."""
    fmt = b"%%0%dX" % (nb_bytes * 2)
    return unhexlify(fmt % code).decode(
        "charmap" if nb_bytes == 1 else "utf-16-be",
        "surrogatepass",
    )


def _map_bfrange_list(
    tokens: List[bytes],
    first_code: int,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[int, bool]:
    """Map consecutive codes to the listed hex strings; return (next code, whether "]" was met)."""
    code = first_code
    for sq in tokens:
        if sq == b"]":
            return code, True
        target = unhexlify(sq).decode("utf-16-be", "surrogatepass")
        map_dict[_bfrange_key(code, map_dict[-1])] = target
        int_entry.append(code)
        code += 1
    return code, False


def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a bfrange section.

    Args:
        line: space separated tokens (hex strings without ``<`` ``>``)
        map_dict: character map, updated in place; ``map_dict[-1]`` holds
            the number of bytes of a source code
        int_entry: list of mapped character codes, updated in place
        multiline_rg: ``(next code, last code)`` when the line continues
            a bracketed list opened on a previous line, else None

    Returns:
        None when the range is complete, or ``(next code, last code)``
        when a bracketed list is still open at the end of the line.

    Raises:
        binascii.Error: if a token is not a valid hex string.

    """
    lst = [x for x in line.split(b" ") if x]
    if multiline_rg is not None:
        # a, b not in the current line
        a, b = multiline_rg
        a, closure_found = _map_bfrange_list(lst, a, map_dict, int_entry)
        return None if closure_found else (a, b)

    a = int(lst[0], 16)
    b = int(lst[1], 16)
    nbi = max(len(lst[0]), len(lst[1]))
    map_dict[-1] = ceil(nbi / 2)
    if lst[2] == b"[":
        a, closure_found = _map_bfrange_list(lst[3:], a, map_dict, int_entry)
        return None if closure_found else (a, b)

    # case without list: the destination is incremented with the source code
    c = int(lst[2], 16)
    fmt2 = b"%%0%dX" % max(4, len(lst[2]))
    while a <= b:
        target = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
        map_dict[_bfrange_key(a, map_dict[-1])] = target
        int_entry.append(a)
        a += 1
        c += 1
    return None
def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a bfchar section.

    The line is read as a sequence of (source code, destination) pairs of
    hex strings; a trailing unpaired token is ignored.

    Args:
        line: space separated tokens (hex strings without ``<`` ``>``)
        map_dict: character map, updated in place; ``map_dict[-1]`` is set
            to the number of bytes of the first source code
        int_entry: list of mapped character codes, updated in place

    Raises:
        binascii.Error: if a source code is not a valid hex string.

    """
    lst = [x for x in line.split(b" ") if x]
    map_dict[-1] = len(lst[0]) // 2
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"
    for pos in range(0, len(lst) - 1, 2):
        source, target = lst[pos], lst[pos + 1]
        map_to = ""
        # placeholder (see above) means empty string
        if target != b".":
            try:
                map_to = unhexlify(target).decode(
                    "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
                )  # join is here as some cases where the code was split
            except BinasciiError as exception:
                # an invalid destination is reported and mapped to ""
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        map_dict[unhexlify(source).decode(key_codec, "surrogatepass")] = map_to
        int_entry.append(int(source, 16))
def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
    """
    Build the map of character widths of a font.

    Args:
        ft: Font Dictionary
        default_font_width: width used for the "default" entry when the
            font does not provide one; replaced by twice the value found
            in ``_default_fonts_space_width`` when /BaseFont is listed there.

    Returns:
        A dict mapping ``chr(code)`` to the width of that character, plus
        a "default" entry.

    """
    font_width_map: Dict[Any, float] = {}
    st: int = 0
    en: int = 0
    try:
        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
    except KeyError:
        # no /BaseFont, or a font without a known space width
        pass
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in ft1:
            font_width_map["default"] = cast(float, ft1["/DW"].get_object())
        else:
            font_width_map["default"] = default_font_width
        if "/W" in ft1:
            w = ft1["/W"].get_object()
        else:
            w = []
        # The array is consumed from the front, one group at a time.
        while len(w) > 0:
            st = w[0] if isinstance(w[0], int) else w[0].get_object()
            second = w[1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                en = second
                width = w[2].get_object()
                if not isinstance(width, (int, float)):
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                    w = w[3:]
                    continue
                for c_code in range(st, en + 1):
                    font_width_map[chr(c_code)] = width
                w = w[3:]
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                c_code = st
                for ww in second:
                    width = ww.get_object()
                    font_width_map[chr(c_code)] = width
                    c_code += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + repr(ft1["/W"]),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width)
            # (only strictly positive widths are taken into account)
            m = 0
            cpt = 0
            for entry in w:
                value = entry.get_object()
                if value > 0:
                    m += value
                    cpt += 1
            font_width_map["default"] = m / max(1, cpt)
        st = cast(int, ft["/FirstChar"])
        en = cast(int, ft["/LastChar"])
        for c_code in range(st, en + 1):
            try:
                width = w[c_code - st].get_object()
                font_width_map[chr(c_code)] = width
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    if is_null_or_none(font_width_map.get("default")):
        font_width_map["default"] = default_font_width if default_font_width else 0.0
    return font_width_map
def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for a space.

    Args:
        font_width_map: widths per character, with a "default" entry
        space_char: key of the space character in ``font_width_map``

    Returns:
        The width stored for ``space_char``; when it is missing or equal
        to 0, half of the "default" width.

    """
    sp_width = font_width_map.get(space_char, 0)
    if sp_width == 0:
        # if using default we consider space will be only half size
        return font_width_map["default"] / 2.0
    return sp_width
def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character.

    Args:
        font_width_map: widths per character, with a "default" entry
        char: the character to look up

    Returns:
        The width stored for ``char``, or the "default" width when
        ``char`` has no entry.

    """
    try:
        return font_width_map[char]
    except KeyError:
        # the "default" entry is only read when the character is unknown
        return font_width_map["default"]
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Fill the character map from the encoding found in an embedded font file.

    The /FontFile stream of the font descriptor is read, and the lines
    starting with ``dup`` that follow ``/Encoding`` in its clear-text part
    are used as (code, glyph name) pairs.

    Args:
        ft: Font Dictionary
        map_dict: character map, updated in place
        int_entry: list of mapped character codes, updated in place

    Returns:
        The same ``map_dict`` and ``int_entry``; they are returned
        unchanged when there is no /FontDescriptor, no /FontFile, or no
        ``/Encoding`` in the clear-text part.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    parts = txt.split(b"/Encoding")
    if len(parts) < 2:
        # no encoding section in the clear part: nothing to extract
        return map_dict, int_entry
    txt = parts[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # a code and a glyph name are both required
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except KeyError:
            # glyph names of the form /uniXXXX carry the code point in hex
            if words[2].startswith(b"/uni"):
                try:
                    v = chr(int(words[2][4:], 16))
                except ValueError:  # pragma: no cover
                    continue
            else:
                continue
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, int_entry