Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/macromanprober.py: 94%

1######################## BEGIN LICENSE BLOCK ########################

2# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.

3# The Original Code is Mozilla Universal charset detector code.

5# The Initial Developer of the Original Code is

6# Netscape Communications Corporation.

10# Contributor(s):

11# Rob Speer - adapt to MacRoman encoding

12# Mark Pilgrim - port to Python

13# Shy Shalom - original C code

14#

15# This library is free software; you can redistribute it and/or

16# modify it under the terms of the GNU Lesser General Public

17# License as published by the Free Software Foundation; either

18# version 2.1 of the License, or (at your option) any later version.

19#

20# This library is distributed in the hope that it will be useful,

21# but WITHOUT ANY WARRANTY; without even the implied warranty of

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

23# Lesser General Public License for more details.

24#

25# You should have received a copy of the GNU Lesser General Public

26# License along with this library; if not, write to the Free Software

27# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

28# 02110-1301 USA

29######################### END LICENSE BLOCK #########################

31from typing import List, Union

33from .charsetprober import CharSetProber

34from .enums import ProbingState

# Number of likelihood categories stored in MacRomanClassModel (values 0..3);
# also the length of each prober's frequency counter list.
FREQ_CAT_NUM = 4

# Character classes.  Every byte value is mapped to exactly one of these by
# MacRoman_CharToClass, and pairs of classes index MacRomanClassModel.
UDF = 0  # undefined
OTH = 1  # other (anything that is not a letter class below)
ASC = 2  # ASCII capital letter
ASS = 3  # ASCII small letter
ACV = 4  # accented capital vowel
ACO = 5  # accented capital, non-vowel
ASV = 6  # accented small vowel
ASO = 7  # accented small, non-vowel
ODD = 8  # character that is unlikely to appear in real text

# Total number of classes; this is the row width of MacRomanClassModel.
CLASS_NUM = 9

# Compared with the Latin1 table, extended characters that are rarely used
# symbols are singled out here and classed as ODD, i.e. always improbable.
# The intent is for MacRoman to step aside in favour of more likely
# encodings in most situations.

# Maps each byte value (0x00 - 0xFF) to its character class.
# fmt: off
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)

# Likelihood of one character class following another, stored as a flat
# 9 x 9 table: the row is the class of the previous byte and the column is
# the class of the current byte (see the row/column labels below).
#
# Meaning of the entries:
#   0 : illegal
#   1 : very unlikely
#   2 : normal
#   3 : very likely
MacRomanClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0, 0, 0, 0, 0, 0, 0, 0, 0,  # UDF
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # OTH
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # ASC
    0, 3, 3, 3, 1, 1, 3, 3, 1,  # ASS
    0, 3, 3, 3, 1, 2, 1, 2, 1,  # ACV
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # ACO
    0, 3, 1, 3, 1, 1, 1, 3, 1,  # ASV
    0, 3, 1, 3, 1, 1, 3, 3, 1,  # ASO
    0, 1, 1, 1, 1, 1, 1, 1, 1,  # ODD
)
# fmt: on

107

108

class MacRomanProber(CharSetProber):
    """Charset prober for the MacRoman encoding.

    Each byte is mapped to a character class via ``MacRoman_CharToClass``;
    the likelihood of every (previous class, current class) pair is looked
    up in ``MacRomanClassModel`` and tallied.  The tallies are turned into
    a confidence value by :meth:`get_confidence`.
    """

    def __init__(self) -> None:
        super().__init__()
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Return the prober to its initial state."""
        self._last_char_class = OTH

        tallies = [0] * FREQ_CAT_NUM
        # Encode the prior that MacRoman is a fairly rare encoding: begin
        # in a slightly improbable state that real evidence has to overcome.
        tallies[2] = 10
        self._freq_counter = tallies

        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober detects."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """Language associated with this prober (always the empty string)."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Update the class-transition tallies with ``byte_str``.

        The input is first passed through the inherited
        ``remove_xml_tags``.  If any transition is marked illegal (0) in
        the model, the prober switches to ``ProbingState.NOT_ME`` and stops
        consuming input.

        Returns the prober's state after processing.
        """
        filtered = self.remove_xml_tags(byte_str)
        for byte in filtered:
            current_class = MacRoman_CharToClass[byte]
            # The model is a flat table: row = previous class, column = current.
            model_index = (self._last_char_class * CLASS_NUM) + current_class
            likelihood = MacRomanClassModel[model_index]
            if likelihood == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self) -> float:
        """Compute the confidence that the data fed so far is MacRoman.

        Returns 0.01 once the prober is in ``ProbingState.NOT_ME``.
        Otherwise the score is the "very likely" tally minus twenty times
        the "very unlikely" tally, divided by the total tally, clamped at
        zero and scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
        if total < 0.01:
            score = 0.0
        else:
            very_likely = self._freq_counter[3]
            very_unlikely = self._freq_counter[1]
            score = (very_likely - very_unlikely * 20.0) / total
        score = max(score, 0.0)
        # Scale MacRoman's confidence down so that other, more accurate
        # detectors can take priority.
        score *= 0.73
        return score