Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import binascii
2from binascii import unhexlify
3from math import ceil
4from typing import Any, Dict, List, Tuple, Union, cast
6from ._codecs import adobe_glyphs, charset_encoding
7from ._utils import logger_error, logger_warning
8from .generic import (
9 ArrayObject,
10 DecodedStreamObject,
11 DictionaryObject,
12 NullObject,
13 StreamObject,
14 is_null_or_none,
15)
18# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Look up a font by name and determine information about it.

    Args:
        font_name: key of the font inside ``obj["/Resources"]["/Font"]``.
        space_width: default space width if no data is found.
        obj: object holding a /Resources dictionary (e.g. an XObject or a Page).

    Returns:
        The four values returned by ``build_char_map_from_dict`` (font
        sub-type, half-space width, encoding, character map) followed by the
        font dictionary itself.

    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    subtype, half_space, encoding, char_map = build_char_map_from_dict(
        space_width, font_dict
    )
    return (subtype, half_space, encoding, char_map, font_dict)
def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Determine information about a font from its font dictionary.

    Args:
        space_width: default space width if no data is found
            (normally half the width of a character).
        ft: Font Dictionary

    Returns:
        Font sub-type, half of the computed space width, encoding,
        character map.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, map_dict = get_encoding(ft)

    # Translate " " back to the key under which the encoding / character map
    # stores it; that key is then used to look up the space width.
    space_key = get_actual_str_key(" ", encoding, map_dict)
    # The width map is built with twice the supplied default width.
    width_map = build_font_width_map(ft, space_width * 2.0)
    half_space = compute_space_width(width_map, space_key) / 2.0

    return (subtype, half_space, encoding, map_dict)
# Fallback used when data is missing, e.g. when the font definition is absent.
# Same tuple layout as the return value of build_char_map_from_dict.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",  # font sub-type
    9999,  # half-space width
    {code: "�" for code in range(256)},  # every code maps to the same placeholder
    {},  # empty character map
)
# Python codec used to decode text for each predefined CMap name.
_predefined_cmap: Dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",
    "/GB-EUC-V": "gbk",
    "/GBpc-EUC-H": "gb2312",
    "/GBpc-EUC-V": "gb2312",
    "/GBK-EUC-H": "gbk",
    "/GBK-EUC-V": "gbk",
    "/GBK2K-H": "gb18030",
    "/GBK2K-V": "gb18030",
    "/ETen-B5-H": "cp950",
    "/ETen-B5-V": "cp950",
    "/ETenms-B5-H": "cp950",
    "/ETenms-B5-V": "cp950",
    "/UniCNS-UTF16-H": "utf-16-be",
    "/UniCNS-UTF16-V": "utf-16-be",
    # UniGB-UTF16-* is the Unicode (UTF-16BE) encoding of the Adobe-GB1
    # collection, so its codes are UTF-16BE code units, not GB18030 bytes.
    "/UniGB-UTF16-H": "utf-16-be",
    "/UniGB-UTF16-V": "utf-16-be",
    # names containing "-UCS2-" are handled in code (see _parse_encoding)
}
105# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
# Space width per base font name, grouped by family (entries of one family
# share the same value).
_default_fonts_space_width: Dict[str, int] = {
    **dict.fromkeys(
        (
            "/Courier",
            "/Courier-Bold",
            "/Courier-BoldOblique",
            "/Courier-Oblique",
        ),
        600,
    ),
    **dict.fromkeys(
        (
            "/Helvetica",
            "/Helvetica-Bold",
            "/Helvetica-BoldOblique",
            "/Helvetica-Oblique",
        ),
        278,
    ),
    **dict.fromkeys(
        (
            "/Helvetica-Narrow",
            "/Helvetica-NarrowBold",
            "/Helvetica-NarrowBoldOblique",
            "/Helvetica-NarrowOblique",
        ),
        228,
    ),
    **dict.fromkeys(
        (
            "/Times-Roman",
            "/Times-Bold",
            "/Times-BoldItalic",
            "/Times-Italic",
            "/Symbol",
        ),
        250,
    ),
    "/ZapfDingbats": 278,
}
def get_encoding(
    ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Return the encoding and the character map of a font.

    Args:
        ft: Font Dictionary

    Returns:
        The encoding from ``_parse_encoding`` (codec name or code -> character
        dict) and the character map from ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # Rule from PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the
    # encoding should be discarded. Here the codes listed by the cmap are
    # turned into identity entries so that only the cmap translates them.
    # A string encoding is expected to be an identity translation already.
    if not isinstance(encoding, dict):
        return encoding, map_dict

    for code in int_entry:
        if code > 255:
            continue
        encoding[code] = chr(code)
    return encoding, map_dict
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, Dict[int, str]]:
    """
    Build the encoding of a font from its /Encoding (or /BaseFont) entry.

    Args:
        ft: Font Dictionary

    Returns:
        Either a codec name (str) or a dict mapping character codes to
        strings. An unrecognised encoding name is logged and returned as is.

    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding

    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # "#xx" sequences in the name are already decoded at this point.
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Deliberately broad: any failure falls back to the standard
            # encoding. Report the base encoding that could not be used.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()

    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates between a starting code (int) and the glyph
        # names assigned to that code and the following ones.
        code: int = 0
        entry: Union[int, str]
        for entry in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(entry, int):
                code = entry
                continue
            # Codes outside the table are skipped; a negative code must not
            # wrap around to the end of the list.
            if 0 <= code < len(encoding):
                try:
                    encoding[code] = adobe_glyphs[entry]  # type: ignore
                except Exception:
                    # glyph name not found: keep the raw name
                    encoding[code] = entry  # type: ignore
            code += 1

    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding
def _parse_to_unicode(
    ft: DictionaryObject
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Parse the /ToUnicode CMap of a font.

    Args:
        ft: Font Dictionary

    Returns:
        The character map and the list of cmap keys as integers. Without a
        /ToUnicode entry the result comes from ``_type1_alternative`` for
        /Type1 fonts and is empty otherwise.

    """
    # Stores every translation; map_dict[-1] holds the number of bytes to
    # convert.
    map_dict: Dict[Any, Any] = {}
    # cmap keys as int, used by the caller to correct the encoding
    int_entry: List[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") != "/Type1":
            return {}, []
        return _type1_alternative(ft, map_dict, int_entry)

    in_range_section: bool = False
    in_char_section: bool = False
    # (current_char, remaining size) of a bfrange list continued on the next
    # line; cf #1285 for an example of such a file
    pending_range: Union[None, Tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_range_section, in_char_section, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_range_section,
            in_char_section,
            pending_range,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry
def get_actual_str_key(
    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
    """
    Find the key under which ``value_char`` is stored.

    Args:
        value_char: the character to look for among the mapped values.
        encoding: codec name, or dict mapping character codes to strings.
        map_dict: character map, searched only when ``encoding`` is not a dict.

    Returns:
        ``chr(code)`` of the matching encoding entry, or the matching
        ``map_dict`` key. When several entries match, the last one wins;
        when none matches, ``value_char`` itself is returned.

    """
    found = value_char
    if isinstance(encoding, dict):
        for code, char in encoding.items():
            if char == value_char:
                found = chr(code)
    else:
        for source, target in map_dict.items():
            if target == value_char:
                found = source
    return found
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode CMap of a font, normalised for line-based parsing.

    Args:
        ft: Font Dictionary containing a /ToUnicode entry.

    Returns:
        The CMap as bytes: section keywords are placed on their own lines and
        every ``<hex>`` token is rewritten as the hex digits followed by a
        space (an empty ``<>`` becomes the placeholder ``.``).

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # no stream available: fall back to this built-in bfrange section
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # The keywords are isolated up front because line breaks may be missing,
    # e.g. in PDFs printed to PDF from Word.
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # text between << and >> is not used, but stays recognisable this way
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    pieces = cm.split(b"<")
    for idx, piece in enumerate(pieces):
        closing = piece.find(b">")
        if closing < 0:
            continue
        if closing == 0:
            # empty string: stash a placeholder that parse_bfchar recognises
            # see https://github.com/py-pdf/pypdf/issues/1111
            hex_digits = b"."
        else:
            hex_digits = piece[:closing].replace(b" ", b"")
        pieces[idx] = hex_digits + b" " + piece[closing + 1 :]

    joined = b" ".join(pieces)
    return (
        joined.replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Process one line of a prepared CMap and return the updated parser state.

    Args:
        line: the line to process.
        process_rg: True while inside a beginbfrange/endbfrange section.
        process_char: True while inside a beginbfchar/endbfchar section.
        multiline_rg: state of a bfrange list continued from a previous line.
        map_dict: character map, updated in place.
        int_entry: list of cmap keys as integers, updated in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)`` triple.

    """
    # empty lines and comment lines (starting with %) change nothing
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # section markers only toggle the matching flag
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a bfrange section into ``map_dict`` and ``int_entry``.

    Args:
        line: space separated tokens; hex values come without angle brackets.
        map_dict: character map, updated in place; ``map_dict[-1]`` holds the
            byte length of the source codes.
        int_entry: list of source codes as integers, updated in place.
        multiline_rg: ``(next source code, last source code)`` when the line
            continues a destination list opened on an earlier line, else None.

    Returns:
        None when the range is complete, otherwise ``(next source code, last
        source code)`` because the closing ``]`` has not been seen yet.

    """
    tokens = [tok for tok in line.split(b" ") if tok]
    array_items: Union[None, List[bytes]]

    if multiline_rg is not None:
        # continuation line: source codes are carried over, not on this line
        src_fmt = b"%%0%dX" % (map_dict[-1] * 2)
        code = multiline_rg[0]
        last_code = multiline_rg[1]
        array_items = tokens
    else:
        code = int(tokens[0], 16)
        last_code = int(tokens[1], 16)
        longest = max(len(tokens[0]), len(tokens[1]))
        map_dict[-1] = ceil(longest / 2)
        src_fmt = b"%%0%dX" % (map_dict[-1] * 2)
        array_items = tokens[3:] if tokens[2] == b"[" else None
    src_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    def store(src_code: int, dst_hex: bytes) -> None:
        # The destination is decoded before the source key is built.
        dst = unhexlify(dst_hex).decode("utf-16-be", "surrogatepass")
        src = unhexlify(src_fmt % src_code).decode(src_codec, "surrogatepass")
        map_dict[src] = dst
        int_entry.append(src_code)

    if array_items is None:
        # case without list: consecutive destinations starting at tokens[2]
        dst_code = int(tokens[2], 16)
        dst_fmt = b"%%0%dX" % max(4, len(tokens[2]))
        while code <= last_code:
            store(code, dst_fmt % dst_code)
            code += 1
            dst_code += 1
        return None

    for item in array_items:
        if item == b"]":
            return None
        store(code, item)
        code += 1
    # list not closed yet: hand the position over to the next line
    return (code, last_code)
def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a bfchar section into ``map_dict`` and ``int_entry``.

    Args:
        line: space separated ``source destination`` hex pairs without angle
            brackets; a destination of ``.`` stands for an empty string.
        map_dict: character map, updated in place; ``map_dict[-1]`` is set to
            the byte length of the first source code.
        int_entry: list of source codes as integers, updated in place.

    """
    tokens = [tok for tok in line.split(b" ") if tok]
    map_dict[-1] = len(tokens[0]) // 2
    src_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    # walk the tokens pairwise; a trailing unpaired token is ignored
    for src_hex, dst_hex in zip(tokens[0::2], tokens[1::2]):
        if dst_hex == b".":
            # placeholder for an empty string
            dst = ""
        else:
            dst_codec = "charmap" if len(dst_hex) < 4 else "utf-16-be"
            dst = unhexlify(dst_hex).decode(dst_codec, "surrogatepass")
        src = unhexlify(src_hex).decode(src_codec, "surrogatepass")
        map_dict[src] = dst
        int_entry.append(int(src_hex, 16))
def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
    """
    Build the mapping from characters to glyph widths of a font.

    Args:
        ft: Font Dictionary
        default_font_width: width used as ``"default"`` when the font gives
            none; replaced by twice the known space width when /BaseFont is
            listed in ``_default_fonts_space_width``.

    Returns:
        A dict keyed by ``chr(code)`` for every code with a known width, plus
        the key ``"default"`` holding the fallback width.

    """
    font_width_map: Dict[Any, float] = {}
    st: int = 0
    en: int = 0
    try:
        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
    except KeyError:
        # no /BaseFont entry, or a font without a known space width
        pass
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in ft1:
            font_width_map["default"] = cast(float, ft1["/DW"].get_object())
        else:
            font_width_map["default"] = default_font_width
        w = ft1["/W"].get_object() if "/W" in ft1 else []
        # Walk the array with a cursor instead of re-slicing it each time.
        pos = 0
        while pos < len(w):
            remaining = len(w) - pos
            st = w[pos] if isinstance(w[pos], int) else w[pos].get_object()
            # A lone trailing entry has no partner and falls into the
            # "unknown widths" branch below instead of raising IndexError.
            second = w[pos + 1].get_object() if remaining > 1 else None
            if isinstance(second, int) and remaining > 2:
                # C_first C_last same_W
                en = second
                width = w[pos + 2].get_object()
                pos += 3
                if not isinstance(width, (int, float)):
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                    continue
                for c_code in range(st, en + 1):
                    font_width_map[chr(c_code)] = width
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                for c_code, ww in enumerate(second, start=st):
                    width = ww.get_object()
                    # same validation as the range form; the code still
                    # advances so later widths keep their position
                    if not isinstance(width, (int, float)):
                        logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                        continue
                    font_width_map[chr(c_code)] = width
                pos += 2
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width) of the positive widths
            m = 0
            cpt = 0
            for xx in w:
                xx = xx.get_object()
                if xx > 0:
                    m += xx
                    cpt += 1
            font_width_map["default"] = m / max(1, cpt)
        st = cast(int, ft["/FirstChar"])
        en = cast(int, ft["/LastChar"])
        for c_code in range(st, en + 1):
            try:
                width = w[c_code - st].get_object()
                font_width_map[chr(c_code)] = width
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    if is_null_or_none(font_width_map.get("default")):
        font_width_map["default"] = default_font_width if default_font_width else 0.0
    return font_width_map
def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for the space character.

    Args:
        font_width_map: widths keyed by character, with a ``"default"`` entry.
        space_char: key of the space character in ``font_width_map``.

    Returns:
        The width stored for ``space_char``; when it is missing or zero, half
        of the ``"default"`` width.

    """
    if space_char in font_width_map:
        stored_width = font_width_map[space_char]
        if stored_width != 0:
            return stored_width
    # if using default we consider space will be only half size
    return font_width_map["default"] / 2.0
def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character.

    Args:
        font_width_map: widths keyed by character, with a ``"default"`` entry.
        char: the character to look up.

    Returns:
        The width stored for ``char``, or the ``"default"`` width when the
        character has no entry.

    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Fill the character map from the encoding stored in a font's /FontFile.

    Reads the clear-text part of the font file (everything before
    ``eexec``) and interprets the ``dup <code> <glyph name> put`` lines that
    follow ``/Encoding``.

    Args:
        ft: Font Dictionary
        map_dict: character map, updated in place.
        int_entry: list of codes as integers, updated in place.

    Returns:
        ``map_dict`` and ``int_entry``; both are returned unchanged when the
        font has no usable /FontDescriptor, /FontFile or /Encoding section.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no /Encoding in the clear part: nothing to extract
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # a code and a glyph name are both required
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            glyph_name = words[2].decode()
        except UnicodeDecodeError:
            # undecodable glyph name: skip the entry
            continue
        try:
            v = adobe_glyphs[glyph_name]
        except KeyError:
            if not words[2].startswith(b"/uni"):
                continue
            # "/uniXXXX" names carry the code point as hexadecimal digits
            try:
                v = chr(int(words[2][4:], 16))
            except ValueError:  # pragma: no cover
                continue
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, int_entry