Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/macromanprober.py: 94%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

52 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>. 

3# The Original Code is Mozilla Universal charset detector code. 

4# 

5# The Initial Developer of the Original Code is 

6# Netscape Communications Corporation. 

7# Portions created by the Initial Developer are Copyright (C) 2001 

8# the Initial Developer. All Rights Reserved. 

9# 

10# Contributor(s): 

11# Rob Speer - adapt to MacRoman encoding 

12# Mark Pilgrim - port to Python 

13# Shy Shalom - original C code 

14# 

15# This library is free software; you can redistribute it and/or 

16# modify it under the terms of the GNU Lesser General Public 

17# License as published by the Free Software Foundation; either 

18# version 2.1 of the License, or (at your option) any later version. 

19# 

20# This library is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

23# Lesser General Public License for more details. 

24# 

25# You should have received a copy of the GNU Lesser General Public 

26# License along with this library; if not, write to the Free Software 

27# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 

28# 02110-1301 USA 

29######################### END LICENSE BLOCK ######################### 

30 

31from typing import List, Union 

32 

33from .charsetprober import CharSetProber 

34from .enums import ProbingState 

35 

# Number of frequency categories used by MacRomanClassModel (values 0..3).
FREQ_CAT_NUM = 4

# Character classes. Each byte value is mapped to exactly one of these by
# MacRoman_CharToClass; pairs of consecutive classes index MacRomanClassModel.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
ODD = 8  # character that is unlikely to appear
CLASS_NUM = 9  # total classes

# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable.  This should let MacRoman get out of the way of more likely
# encodings in most situations.

# Byte value (0x00 - 0xFF) -> character class; 256 entries, 8 per row.
# fmt: off
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)

# Likelihood of seeing class <column> immediately after class <row>, stored
# as a flat CLASS_NUM x CLASS_NUM table (index = row * CLASS_NUM + column):
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
#   UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
    0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
)
# fmt: on

106# fmt: on 

107 

108 

class MacRomanProber(CharSetProber):
    """Single-byte prober that scores input as MacRoman text.

    Every byte is mapped to a character class via ``MacRoman_CharToClass``.
    Each (previous class, current class) pair is then looked up in
    ``MacRomanClassModel`` and the resulting likelihood category (1..3) is
    tallied.  A category of 0 marks the pair as illegal and rules the
    encoding out.
    """

    def __init__(self) -> None:
        super().__init__()
        # Declared here so the attributes exist (and are typed) before
        # reset() gives them their real starting values.
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Restore the prober to its initial, slightly pessimistic state."""
        tallies = [0] * FREQ_CAT_NUM
        # express the prior that MacRoman is a somewhat rare encoding;
        # this can be done by starting out in a slightly improbable state
        # that must be overcome
        tallies[2] = 10

        self._last_char_class = OTH
        self._freq_counter = tallies

        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober detects."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """This prober does not identify a language; always empty."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume ``byte_str`` and update the class-pair tallies.

        The input is first passed through ``remove_xml_tags``.  Processing
        stops at the first class pair the model marks as illegal (0), at
        which point the prober state becomes ``ProbingState.NOT_ME``.

        Returns the prober state after processing.
        """
        filtered = self.remove_xml_tags(byte_str)
        for byte_value in filtered:
            current_class = MacRoman_CharToClass[byte_value]
            # The model is a flat CLASS_NUM x CLASS_NUM table indexed by
            # (previous class, current class).
            model_index = (self._last_char_class * CLASS_NUM) + current_class
            likelihood = MacRomanClassModel[model_index]
            if likelihood == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self) -> float:
        """Return the confidence that the fed data is MacRoman.

        Returns 0.01 once the prober has been ruled out.  Otherwise the
        score is (very-likely pairs - 20 * very-unlikely pairs) divided by
        the total tally, floored at 0.0 and scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        tallies = self._freq_counter
        total = sum(tallies)
        if total < 0.01:
            score = 0.0
        else:
            score = (tallies[3] - tallies[1] * 20.0) / total

        # lower the confidence of MacRoman so that other more accurate
        # detector can take priority.
        return max(score, 0.0) * 0.73