Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/encodingdb.py: 84%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

68 statements  

1import logging 

2import re 

3from typing import Dict, Iterable, Optional, cast 

4 

5from pdfminer.glyphlist import glyphname2unicode 

6from pdfminer.latin_enc import ENCODING 

7from pdfminer.pdfexceptions import PDFKeyError 

8from pdfminer.psparser import PSLiteral 

9 

10HEXADECIMAL = re.compile(r"[0-9a-fA-F]+") 

11 

12log = logging.getLogger(__name__) 

13 

14 

def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    In contrast to the specification, this raises a key error instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is no match.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, e.g. "uni0041", "u0041" or an underscore-joined
        sequence of such names; anything after the first "." is ignored
    :returns: the unicode character(s) the name maps to
    :raises PDFKeyError: if name is not a str, does not match any of the
        supported forms, or denotes a surrogate / out-of-range code point
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Everything from the first period onwards is a suffix that is dropped.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        # Each underscore-separated component is mapped on its own.
        return "".join(map(name2unicode, components))

    elif name in glyphname2unicode:
        return glyphname2unicode[name]

    elif name.startswith("uni"):
        # Slice the prefix off. str.strip("uni") would remove any of the
        # characters "u", "n", "i" from *both* ends and accept bad names.
        name_without_uni = name[len("uni") :]

        # fullmatch: every remaining character must be a hex digit, otherwise
        # int() below would raise ValueError instead of a key error.
        if (
            HEXADECIMAL.fullmatch(name_without_uni)
            and len(name_without_uni) % 4 == 0
        ):
            unicode_digits = [
                int(name_without_uni[i : i + 4], base=16)
                for i in range(0, len(name_without_uni), 4)
            ]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            characters = map(chr, unicode_digits)
            return "".join(characters)

    elif name.startswith("u"):
        name_without_u = name[len("u") :]

        if HEXADECIMAL.fullmatch(name_without_u) and 4 <= len(name_without_u) <= 6:
            unicode_digit = int(name_without_u, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            # Six hex digits can exceed the last code point; chr() would
            # raise ValueError there, so report it as a key error instead.
            if unicode_digit > 0x10FFFF:
                raise PDFKeyError(
                    "Unicode digit %d is invalid because "
                    "it is larger than 10FFFF" % unicode_digit,
                )
            return chr(unicode_digit)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )

69 

70 

def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall inside the UTF-16 surrogate range.

    Values D800 through DFFF are reserved for surrogate pairs in UTF-16 and
    are therefore not accepted as unicode values.

    :param unicode_digit: the numeric code point to check
    :raises PDFKeyError: if the code point lies in D800 through DFFF
    """
    # 0xD7FF == 55295 and 0xE000 == 57344: the values just outside the range.
    if not (0xD7FF < unicode_digit < 0xE000):
        return

    raise PDFKeyError(
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit,
    )

82 

83 

class EncodingDB:
    """Lookup tables from character codes to unicode strings.

    The tables are filled once, when the class body is executed, from the
    rows of ``ENCODING``. Each row holds a glyph name followed by its code in
    the standard, mac, win and pdf encodings.
    """

    # One table per encoding, keyed by character code.
    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        # A falsy code means the glyph gets no entry in that table.
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table, as looked up by get_encoding().
    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-unicode table for an encoding.

        :param name: encoding name; names not present in ``encodings`` fall
            back to the standard table
        :param diff: optional sequence of differences. An int sets the
            current code; each PSLiteral is a glyph name assigned to the
            current code, after which the code advances by one. Items of any
            other type are ignored.
        :returns: the shared class-level table when ``diff`` is empty or
            None, otherwise a modified copy of it
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            return base_table

        # Work on a copy so the class-level table stays untouched.
        table = base_table.copy()
        code = 0
        for item in diff:
            if isinstance(item, int):
                code = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                table[code] = name2unicode(cast(str, item.name))
            except (KeyError, ValueError) as err:
                # Unknown glyph names are logged and skipped; the code still
                # advances below.
                log.debug(str(err))
            code += 1
        return table

126 return cid2unicode