Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/encodingdb.py: 80%

1import logging

2import re

3from collections.abc import Iterable

4from typing import ClassVar, cast

6from pdfminer.glyphlist import glyphname2unicode

7from pdfminer.latin_enc import ENCODING

8from pdfminer.pdfexceptions import PDFKeyError

9from pdfminer.psparser import PSLiteral

# Matches a run of one or more hexadecimal digits (either case). The pattern
# itself is unanchored; callers choose between ``match`` and ``fullmatch``.
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    The name is processed in these steps:

    1. Everything from the first ``.`` onwards is discarded.
    2. The remainder is split on ``_``; if there is more than one component,
       each component is converted on its own and the results are joined.
    3. A single component is looked up in ``glyphname2unicode``.
    4. Otherwise ``uni`` followed by one or more groups of exactly four
       hexadecimal digits yields one character per group.
    5. Otherwise ``u`` followed by four to six hexadecimal digits yields a
       single character.

    In contrast to the specification, an unknown name does not produce an
    empty string: an error is raised so the caller must decide explicitly
    what to do when there is no match.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, e.g. ``"A"``, ``"f_i"``, ``"uni00410042"``
        or ``"u1F600"``.
    :returns: the Unicode string for the glyph name.
    :raises PDFKeyError: if ``name`` is not a ``str``, if it does not match
        any of the forms above, or if it encodes a code point in the
        surrogate range D800 through DFFF or above U+10FFFF.
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            f'Could not convert unicode name "{name}" to character because '
            f"it should be of type str but is of type {type(name)}",
        )

    # Only the part before the first period takes part in the mapping.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        # Ligature-style name: convert every component independently.
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # removeprefix() drops exactly one leading "uni". str.strip("uni")
        # would strip any of the characters 'u', 'n', 'i' from BOTH ends and
        # so wrongly accept names such as "uniuni0041" or "uni0041uni".
        hex_digits = name.removeprefix("uni")

        # fullmatch() guarantees every character is a hex digit, so the
        # int() conversions below cannot fail on trailing garbage.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[start : start + 4], base=16)
                for start in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))

    elif name.startswith("u"):
        hex_digits = name.removeprefix("u")

        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the largest code point chr() accepts;
            # such names fall through to the generic error below.
            if code_point <= 0x10FFFF:
                return chr(code_point)

    raise PDFKeyError(
        f'Could not convert unicode name "{name}" to character because '
        "it does not match specification",
    )

def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall in the UTF-16 surrogate range.

    Unicode values should not be in the range D800 through DFFF because
    that range is used for surrogate pairs in UTF-16.

    :param unicode_digit: the integer code point to validate.
    :returns: ``None`` when the code point is outside the surrogate range.
    :raises PDFKeyError: if ``unicode_digit`` lies in D800 through DFFF
        (inclusive).
    """
    # Inclusive hex bounds; equivalent to the exclusive decimal comparison
    # 55295 < unicode_digit < 57344 for integers.
    if 0xD800 <= unicode_digit <= 0xDFFF:
        raise PDFKeyError(
            f"Unicode digit {unicode_digit} is invalid because "
            "it is in the range D800 through DFFF",
        )

class EncodingDB:
    """Lookup tables from character codes to Unicode strings.

    The four per-encoding tables are filled once, while the class body
    executes, from the rows of ``ENCODING``. Each row provides a glyph name
    and one code per encoding; a falsy code means the row adds no entry to
    that encoding's table.
    """

    # Character code -> Unicode string, one table per supported encoding.
    std2unicode: ClassVar[dict[int, str]] = {}
    mac2unicode: ClassVar[dict[int, str]] = {}
    win2unicode: ClassVar[dict[int, str]] = {}
    pdf2unicode: ClassVar[dict[int, str]] = {}

    # Runs at class-creation time. The loop variables stay bound in the
    # class namespace afterwards.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> the table built above for that encoding.
    encodings: ClassVar[dict[str, dict[int, str]]] = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Iterable[object] | None = None,
    ) -> dict[int, str]:
        """Return the code-to-Unicode table for an encoding.

        :param name: encoding name; a name not present in ``encodings``
            falls back to the ``StandardEncoding`` table.
        :param diff: optional sequence of overrides. An ``int`` sets the
            current character code; each ``PSLiteral`` is converted with
            ``name2unicode``, stored at the current code, and advances the
            code by one. Items of any other type are ignored.
        :returns: the table for ``name``. When ``diff`` is truthy this is a
            private copy with the overrides applied; otherwise it is the
            shared class-level table itself, so callers should not mutate it.
        """
        cid2unicode = cls.encodings.get(name, cls.std2unicode)
        if diff:
            # Copy first so the overrides never leak into the shared table.
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(cast(str, x.name))
                    except (KeyError, ValueError) as e:
                        # Best effort: an unconvertible glyph name is logged
                        # and skipped, leaving any existing entry untouched.
                        log.debug(str(e))
                    # The code advances even when the conversion failed.
                    cid += 1
        return cid2unicode