Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/encodingdb.py: 80%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

69 statements  

1import logging 

2import re 

3from collections.abc import Iterable 

4from typing import ClassVar, cast 

5 

6from pdfminer.glyphlist import glyphname2unicode 

7from pdfminer.latin_enc import ENCODING 

8from pdfminer.pdfexceptions import PDFKeyError 

9from pdfminer.psparser import PSLiteral 

10 

# Matches a run of one or more hexadecimal digits, upper- or lower-case.
# The pattern has no anchors of its own, so whether a whole string or only
# a prefix is checked depends on the regex method the caller picks.
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

14 

15 

def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    In contrast to the specification, this raises a PDFKeyError instead of
    returning an empty string when the name is unknown, so the caller must
    decide explicitly what to do when there is no match.

    Resolution order, applied after dropping everything from the first "."
    onwards (the glyph-name suffix):

    1. Names containing "_" are split and each component is converted
       recursively; the results are concatenated.
    2. Names found in the glyph list are looked up directly.
    3. "uni" followed by one or more groups of exactly four hex digits.
    4. "u" followed by four to six hex digits.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, e.g. "A", "uni0041", "u1F600" or "f_i.alt"
    :returns: the unicode character(s) the name maps to
    :raises PDFKeyError: if ``name`` is not a str, does not match any of the
        forms above, or encodes a code point that is a UTF-16 surrogate or
        lies beyond U+10FFFF
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            f'Could not convert unicode name "{name}" to character because '
            f"it should be of type str but is of type {type(name)}",
        )

    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Remove the prefix only. str.strip("uni") would strip any run of
        # the characters u/n/i from BOTH ends and accept names like
        # "uninu0041".
        hex_digits = name[len("uni") :]

        # fullmatch: the whole remainder must be hex. A prefix-only match
        # would let trailing garbage through to int(), which raises
        # ValueError rather than the documented key error.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))

    elif name.startswith("u"):
        hex_digits = name[len("u") :]

        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the Unicode range; chr() would raise
            # ValueError there, so fall through to the key error instead.
            if code_point <= 0x10FFFF:
                return chr(code_point)

    raise PDFKeyError(
        f'Could not convert unicode name "{name}" to character because '
        "it does not match specification",
    )

70 

71 

def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall inside the UTF-16 surrogate block.

    Values in the range D800 through DFFF are reserved for surrogate pairs
    in UTF-16 and therefore are not accepted as characters.

    :param unicode_digit: the code point to check
    :raises PDFKeyError: if the code point lies in D800 through DFFF
    """
    # Exclusive bounds: 0xD7FF is the last value below the block and 0xE000
    # the first value above it.
    in_surrogate_block = 0xD7FF < unicode_digit < 0xE000
    if not in_surrogate_block:
        return

    raise PDFKeyError(
        f"Unicode digit {unicode_digit} is invalid because "
        "it is in the range D800 through DFFF",
    )

83 

84 

class EncodingDB:
    """Character-code to Unicode tables for the predefined encodings.

    The four tables are filled once, while the class body executes, from
    the rows of ENCODING. Each row supplies a glyph name plus one code per
    encoding; a falsy code means the row is skipped for that encoding.
    """

    std2unicode: ClassVar[dict[int, str]] = {}
    mac2unicode: ClassVar[dict[int, str]] = {}
    win2unicode: ClassVar[dict[int, str]] = {}
    pdf2unicode: ClassVar[dict[int, str]] = {}

    # Populate all four tables in a single pass over the encoding rows.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Lookup from encoding name to its table.
    encodings: ClassVar[dict[str, dict[int, str]]] = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Iterable[object] | None = None,
    ) -> dict[int, str]:
        """Return the code-to-unicode table for ``name``, patched by ``diff``.

        :param name: encoding name; names not present in ``encodings`` fall
            back to the standard table
        :param diff: optional sequence mixing ints and PSLiteral glyph
            names. An int sets the current code; each PSLiteral is assigned
            to the current code, which then advances by one. Items of any
            other type are ignored.
        :returns: the shared class-level table itself when ``diff`` is
            falsy, otherwise a patched copy
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            return base_table

        patched = base_table.copy()
        code = 0
        for item in diff:
            if isinstance(item, int):
                code = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                patched[code] = name2unicode(cast(str, item.name))
            except (KeyError, ValueError) as e:
                # Unresolvable glyph names are logged and skipped; the code
                # still advances below.
                log.debug(str(e))
            code += 1
        return patched