Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/encodingdb.py: 80%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from collections.abc import Iterable
4from typing import ClassVar, cast
6from pdfminer.glyphlist import glyphname2unicode
7from pdfminer.latin_enc import ENCODING
8from pdfminer.pdfexceptions import PDFKeyError
9from pdfminer.psparser import PSLiteral
# One or more hexadecimal digits, upper or lower case. The pattern itself is
# unanchored; callers decide whether to match a prefix or the whole string.
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it stands for.

    In contrast to the specification, this raises a KeyError instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is no match.

    Everything from the first "." onwards is discarded, then the name is
    resolved in this order:

    1. a name containing "_" is split on it; every component is converted
       recursively and the results are concatenated;
    2. a name present in ``glyphname2unicode`` maps to its table entry;
    3. ``uni`` followed by one or more groups of four hex digits maps to one
       character per group;
    4. ``u`` followed by four to six hex digits maps to a single character.

    Hex digits are accepted in upper or lower case.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name to convert
    :returns: the unicode character(s) the name stands for
    :raises PDFKeyError: if ``name`` is not a str, does not match any of the
        forms above, or encodes a code point that is a UTF-16 surrogate or
        lies beyond U+10FFFF
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            f'Could not convert unicode name "{name}" to character because '
            f"it should be of type str but is of type {type(name)}",
        )

    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Remove exactly the prefix. str.strip("uni") treats its argument as a
        # character set and trims both ends, so it would also accept malformed
        # names such as "unin0041" or "uni0041u".
        hex_digits = name.removeprefix("uni")

        # fullmatch: every character must be a hex digit, otherwise int()
        # below would raise ValueError instead of the documented key error.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))

    elif name.startswith("u"):
        hex_digits = name.removeprefix("u")

        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the last Unicode code point, which
            # chr() rejects with ValueError; report it as a key error instead.
            if code_point > 0x10FFFF:
                raise PDFKeyError(
                    f'Could not convert unicode name "{name}" to character '
                    "because it is beyond the Unicode range",
                )
            return chr(code_point)

    raise PDFKeyError(
        f'Could not convert unicode name "{name}" to character because '
        "it does not match specification",
    )
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall in the UTF-16 surrogate range.

    Values from D800 through DFFF are used for surrogate pairs in UTF-16 and
    are therefore not accepted as unicode values.

    :param unicode_digit: numeric code point to check
    :raises PDFKeyError: if the value lies in the range D800 through DFFF
    """
    # Strict bounds on either side: 0xD7FF is 55295 and 0xE000 is 57344.
    in_surrogate_range = 0xD7FF < unicode_digit < 0xE000
    if not in_surrogate_range:
        return

    raise PDFKeyError(
        f"Unicode digit {unicode_digit} is invalid because "
        "it is in the range D800 through DFFF",
    )
class EncodingDB:
    """Code-to-unicode lookup tables for the predefined encodings.

    The four tables are filled once, while the class body executes, from the
    rows of ``ENCODING``. ``encodings`` exposes them by encoding name.
    """

    # One table per encoding column of ENCODING: character code -> text.
    std2unicode: ClassVar[dict[int, str]] = {}
    mac2unicode: ClassVar[dict[int, str]] = {}
    win2unicode: ClassVar[dict[int, str]] = {}
    pdf2unicode: ClassVar[dict[int, str]] = {}

    # Each row carries a glyph name and its code in each of the four
    # encodings. A falsy code is skipped (presumably meaning the glyph has no
    # slot in that encoding -- confirm against latin_enc).
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table. Names not listed here fall back to
    # std2unicode in get_encoding().
    encodings: ClassVar[dict[str, dict[int, str]]] = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Iterable[object] | None = None,
    ) -> dict[int, str]:
        """Return the code-to-unicode table for encoding ``name``.

        :param name: encoding name; unknown names use the standard table
        :param diff: optional sequence of overrides. An int sets the current
            code; a PSLiteral assigns its glyph name to the current code and
            then advances the code by one; any other entry is ignored.
        :returns: the shared class-level table when ``diff`` is empty or
            None, otherwise a copy of it with the overrides applied
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            return base_table

        # Work on a copy so the shared class-level table stays untouched.
        table = base_table.copy()
        code = 0
        for entry in diff:
            if isinstance(entry, int):
                code = entry
                continue
            if not isinstance(entry, PSLiteral):
                continue
            try:
                table[code] = name2unicode(cast(str, entry.name))
            except (KeyError, ValueError) as e:
                # An unconvertible glyph name is only logged; the code still
                # advances below so later entries keep their positions.
                log.debug(str(e))
            code += 1
        return table