Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/encodingdb.py: 84%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from typing import Dict, Iterable, Optional, cast
5from pdfminer.glyphlist import glyphname2unicode
6from pdfminer.latin_enc import ENCODING
7from pdfminer.pdfexceptions import PDFKeyError
8from pdfminer.psparser import PSLiteral
# Matches a run of one or more hexadecimal digits (upper or lower case).
# The pattern itself is unanchored; callers decide how much of the string
# must match.
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it stands for.

    In contrast to the specification, this raises a KeyError instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is no match.

    Resolution order for a name (after dropping any ``.suffix``):

    1. names containing underscores are split and every component is
       converted on its own, the results being concatenated;
    2. names present in ``glyphname2unicode`` are looked up directly;
    3. ``uni`` followed by one or more groups of 4 hex digits;
    4. ``u`` followed by 4 to 6 hex digits.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: the glyph name to convert
    :returns: the unicode character(s) the name maps to
    :raises PDFKeyError: if ``name`` is not a str, does not match any of the
        forms above, or encodes a code point that is not a valid Unicode
        scalar value
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Everything from the first period onwards is a suffix and is ignored.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    elif name in glyphname2unicode:
        return glyphname2unicode[name]

    elif name.startswith("uni"):
        # Slice the prefix off. str.strip("uni") would instead remove any run
        # of the characters "u", "n" and "i" from *both* ends of the name.
        name_without_uni = name[len("uni") :]

        # fullmatch: every remaining character must be a hex digit, otherwise
        # int() below would fail with a ValueError on the trailing garbage.
        if (
            HEXADECIMAL.fullmatch(name_without_uni)
            and len(name_without_uni) % 4 == 0
        ):
            unicode_digits = [
                int(name_without_uni[i : i + 4], base=16)
                for i in range(0, len(name_without_uni), 4)
            ]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            characters = map(chr, unicode_digits)
            return "".join(characters)

    elif name.startswith("u"):
        name_without_u = name[len("u") :]

        if HEXADECIMAL.fullmatch(name_without_u) and 4 <= len(name_without_u) <= 6:
            unicode_digit = int(name_without_u, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            # Six hex digits can encode values above the last Unicode code
            # point; chr() would raise ValueError for those, so let them fall
            # through to the PDFKeyError below instead.
            if unicode_digit <= 0x10FFFF:
                return chr(unicode_digit)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall inside the UTF-16 surrogate range.

    Unicode values D800 through DFFF are used for surrogate pairs in UTF-16
    and therefore must not be produced from a glyph name.

    :param unicode_digit: the code point to validate
    :raises KeyError if unicode digit is invalid
    """
    in_surrogate_range = 0xD7FF < unicode_digit < 0xE000
    if not in_surrogate_range:
        return

    raise PDFKeyError(
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit,
    )
class EncodingDB:
    """Code-to-unicode tables for the predefined encodings.

    The four tables are filled once, while the class body executes, from the
    rows of ``ENCODING``. Each row carries a glyph name followed by its code
    in the standard, mac, win and pdf encodings; falsy codes are skipped.
    """

    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    # Resolve every glyph name once and register the resulting character
    # under each encoding that defines a code for it.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Lookup from encoding name to its table.
    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-unicode table for the encoding ``name``.

        Unknown encoding names fall back to the standard encoding table.

        :param name: key into ``encodings``, e.g. ``"WinAnsiEncoding"``
        :param diff: optional sequence of overrides. An int sets the current
            code; each PSLiteral is converted with ``name2unicode``, stored
            at the current code, and advances the code by one. Items of any
            other type are ignored.
        :returns: the shared class-level table when ``diff`` is falsy,
            otherwise a copy of it with the overrides applied
        """
        table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            # No overrides: hand out the shared table itself, not a copy.
            return table

        table = table.copy()
        code = 0
        for item in diff:
            if isinstance(item, int):
                code = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                table[code] = name2unicode(cast(str, item.name))
            except (KeyError, ValueError) as err:
                # Unconvertible names are only logged; the code still
                # advances so the following names keep their positions.
                log.debug(str(err))
            code += 1
        return table