Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/encodingdb.py: 84%

1import logging

2import re

3from typing import Dict, Iterable, Optional, cast

5from pdfminer.glyphlist import glyphname2unicode

6from pdfminer.latin_enc import ENCODING

7from pdfminer.pdfexceptions import PDFKeyError

8from pdfminer.psparser import PSLiteral

# One or more hexadecimal digits (either case). The pattern is unanchored, so
# callers choose between prefix matching (``match``) and whole-string
# matching (``fullmatch``).
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    In contrast to the specification, this raises a key error instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is not a match.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name such as ``"A"``, ``"f_i"``, ``"uni0041"`` or
        ``"u1F600"``. Anything from the first ``"."`` onwards is ignored and
        ``"_"`` separates the components of a ligature.
    :returns: the unicode character(s) the name maps to.
    :raises PDFKeyError: if ``name`` is not a ``str``, if it does not match
        the specification, or if it denotes a code point that is a surrogate
        (D800 through DFFF) or lies above 10FFFF.
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Drop the suffix (everything from the first period onwards), then split
    # the remainder into its underscore-separated components.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        # Ligature-style name: map every component on its own and concatenate.
        return "".join(map(name2unicode, components))

    elif name in glyphname2unicode:
        return glyphname2unicode[name]

    elif name.startswith("uni"):
        # Remove exactly the "uni" prefix. ``str.strip("uni")`` would strip
        # any of the characters u/n/i from *both* ends of the name.
        hex_digits = name[len("uni") :]

        # ``fullmatch`` guarantees every character is a hex digit, so the
        # ``int`` conversions below cannot raise ValueError.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))

    elif name.startswith("u"):
        # Remove exactly the single-character "u" prefix.
        hex_digits = name[len("u") :]

        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the last code point; report that as a
            # key error instead of letting ``chr`` raise ValueError.
            if code_point > 0x10FFFF:
                raise PDFKeyError(
                    "Unicode digit %d is invalid because "
                    "it is above 10FFFF" % code_point,
                )
            return chr(code_point)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )

def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall inside the UTF-16 surrogate block.

    Values D800 through DFFF are reserved for surrogate pairs in UTF-16 and
    therefore do not stand for characters themselves.

    :param unicode_digit: the code point to validate.
    :raises PDFKeyError: if the code point lies in D800 through DFFF.
    """
    first_surrogate = 0xD800
    last_surrogate = 0xDFFF

    # Guard clause: anything outside the surrogate block is acceptable.
    if unicode_digit < first_surrogate or unicode_digit > last_surrogate:
        return

    raise PDFKeyError(
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit,
    )

class EncodingDB:
    """Lookup tables from character codes to unicode strings.

    The tables are filled once, while the class body executes, from the rows
    of ``ENCODING``. Each row holds a glyph name followed by its code in the
    standard, mac, win and pdf encodings.
    """

    # One table per encoding: character code -> unicode string.
    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        # A falsy code means the glyph gets no entry in that encoding.
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table, used by ``get_encoding``.
    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-unicode table for ``name`` with ``diff`` applied.

        :param name: encoding name; unknown names fall back to the standard
            table.
        :param diff: optional sequence mixing integers and ``PSLiteral``
            glyph names. An integer sets the current code; each literal is
            assigned to the current code, which then advances by one. Items
            of any other type are ignored.
        :returns: the shared class-level table when ``diff`` is falsy (it is
            not copied, so callers should not mutate it), otherwise a
            modified copy.
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            return base_table

        # Work on a copy so the shared class-level table stays untouched.
        patched = base_table.copy()
        code = 0
        for item in diff:
            if isinstance(item, int):
                code = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                patched[code] = name2unicode(cast(str, item.name))
            except (KeyError, ValueError) as e:
                # Best effort: an unmappable glyph name is logged and skipped.
                log.debug(str(e))
            # The code advances even when the glyph name could not be mapped.
            code += 1
        return patched

126 return cid2unicode