Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/utf1632prober.py: 98%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

109 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# 

3# Contributor(s): 

4# Jason Zavaglia 

5# 

6# This library is free software; you can redistribute it and/or 

7# modify it under the terms of the GNU Lesser General Public 

8# License as published by the Free Software Foundation; either 

9# version 2.1 of the License, or (at your option) any later version. 

10# 

11# This library is distributed in the hope that it will be useful, 

12# but WITHOUT ANY WARRANTY; without even the implied warranty of 

13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

14# Lesser General Public License for more details. 

15# 

16# You should have received a copy of the GNU Lesser General Public 

17# License along with this library; if not, write to the Free Software 

18# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 

19# 02110-1301 USA 

20######################### END LICENSE BLOCK ######################### 

21from typing import List, Union 

22 

23from .charsetprober import CharSetProber 

24from .enums import ProbingState 

25 

26 

class UTF1632Prober(CharSetProber):
    """
    Heuristic prober for UTF-16 / UTF-32 text, in either byte order.

    The prober counts zero and non-zero bytes at each position modulo 4.
    Input shaped like ( \\0 \\0 \\0 [nonzero] )+ is probably UTF-32BE, and
    input shaped like ( \\0 [nonzero] )+ is probably UTF-16BE; the mirrored
    patterns indicate the little-endian variants.  Every complete group of
    four bytes is also checked for sequences that are illegal in each
    encoding, which rules the corresponding candidate out.
    """

    # how many logical characters to scan before feeling confident of prediction
    MIN_CHARS_FOR_DETECTION = 20
    # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
    EXPECTED_RATIO = 0.94

    def __init__(self) -> None:
        """Create the prober and put it into its initial detecting state."""
        super().__init__()
        # The attributes are declared here so they exist on every instance;
        # reset() below assigns the very same starting values again.
        self.position = 0
        self.quad = [0, 0, 0, 0]
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self._state = ProbingState.DETECTING
        self.reset()

    def reset(self) -> None:
        """Discard all gathered statistics and start detecting afresh."""
        super().reset()
        self._state = ProbingState.DETECTING
        # number of bytes consumed so far
        self.position = 0
        # the four most recently seen bytes, indexed by position % 4
        self.quad = [0, 0, 0, 0]
        # per-slot (position % 4) tallies of zero / non-zero bytes
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        # set to True once a byte sequence illegal in that encoding is seen
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        # True while a high surrogate is still waiting for its low surrogate
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False

    @property
    def charset_name(self) -> str:
        """
        Name of the best matching encoding.

        The candidates are tried in the order UTF-32BE, UTF-32LE, UTF-16BE,
        UTF-16LE; when none of them looks likely, "utf-16" is returned.
        """
        candidates = (
            (self.is_likely_utf32be, "utf-32be"),
            (self.is_likely_utf32le, "utf-32le"),
            (self.is_likely_utf16be, "utf-16be"),
            (self.is_likely_utf16le, "utf-16le"),
        )
        for looks_likely, name in candidates:
            if looks_likely():
                return name
        # default to something valid
        return "utf-16"

    @property
    def language(self) -> str:
        """This prober does not identify a language; always an empty string."""
        return ""

    def approx_32bit_chars(self) -> float:
        """Bytes consumed divided by four, but never less than 1.0."""
        return max(1.0, self.position / 4.0)

    def approx_16bit_chars(self) -> float:
        """Bytes consumed divided by two, but never less than 1.0."""
        return max(1.0, self.position / 2.0)

    def _share_exceeds_ratio(self, count: int, approx_chars: float) -> bool:
        """Return True when count / approx_chars is above EXPECTED_RATIO."""
        return count / approx_chars > self.EXPECTED_RATIO

    def is_likely_utf32be(self) -> bool:
        """
        Return True when the byte statistics match UTF-32BE.

        That requires enough input, no invalid UTF-32BE unit so far, mostly
        zero bytes in slots 0-2 and mostly non-zero bytes in slot 3.
        """
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf32be:
            return False
        return (
            self._share_exceeds_ratio(self.zeros_at_mod[0], approx_chars)
            and self._share_exceeds_ratio(self.zeros_at_mod[1], approx_chars)
            and self._share_exceeds_ratio(self.zeros_at_mod[2], approx_chars)
            and self._share_exceeds_ratio(self.nonzeros_at_mod[3], approx_chars)
        )

    def is_likely_utf32le(self) -> bool:
        """
        Return True when the byte statistics match UTF-32LE.

        That requires enough input, no invalid UTF-32LE unit so far, mostly
        non-zero bytes in slot 0 and mostly zero bytes in slots 1-3.
        """
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf32le:
            return False
        return (
            self._share_exceeds_ratio(self.nonzeros_at_mod[0], approx_chars)
            and self._share_exceeds_ratio(self.zeros_at_mod[1], approx_chars)
            and self._share_exceeds_ratio(self.zeros_at_mod[2], approx_chars)
            and self._share_exceeds_ratio(self.zeros_at_mod[3], approx_chars)
        )

    def is_likely_utf16be(self) -> bool:
        """
        Return True when the byte statistics match UTF-16BE.

        That requires enough input, no invalid UTF-16BE unit so far, mostly
        zero bytes in the even slots and mostly non-zero bytes in the odd
        slots.
        """
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf16be:
            return False
        odd_nonzeros = self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]
        even_zeros = self.zeros_at_mod[0] + self.zeros_at_mod[2]
        return self._share_exceeds_ratio(
            odd_nonzeros, approx_chars
        ) and self._share_exceeds_ratio(even_zeros, approx_chars)

    def is_likely_utf16le(self) -> bool:
        """
        Return True when the byte statistics match UTF-16LE.

        That requires enough input, no invalid UTF-16LE unit so far, mostly
        non-zero bytes in the even slots and mostly zero bytes in the odd
        slots.
        """
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf16le:
            return False
        even_nonzeros = self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]
        odd_zeros = self.zeros_at_mod[1] + self.zeros_at_mod[3]
        return self._share_exceeds_ratio(
            even_nonzeros, approx_chars
        ) and self._share_exceeds_ratio(odd_zeros, approx_chars)

    def validate_utf32_characters(self, quad: List[int]) -> None:
        """
        Check one group of four bytes against both UTF-32 byte orders.

        UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
        excluding 0x0000D800 - 0x0000DFFF

        The matching invalid_utf32be / invalid_utf32le flag is set when the
        group cannot be a valid code unit in that byte order; the flags are
        never cleared here.

        https://en.wikipedia.org/wiki/UTF-32
        """
        # Big-endian: quad[0] is the most significant byte.  By the time the
        # surrogate test runs quad[0] is already known to be zero, so it does
        # not need to be compared again.
        if (
            quad[0] != 0
            or quad[1] > 0x10
            or (quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
        ):
            self.invalid_utf32be = True

        # Little-endian: the same test with the byte order mirrored.
        if (
            quad[3] != 0
            or quad[2] > 0x10
            or (quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
        ):
            self.invalid_utf32le = True

    @staticmethod
    def _advance_surrogate_state(high_byte: int, awaiting_low: bool):
        """
        Advance the surrogate-pair tracking by one 16-bit unit.

        high_byte is the most significant byte of the unit and awaiting_low
        says whether the previous unit was a high surrogate still waiting
        for its partner.  Returns a (awaiting_low, invalid) pair of bools:
        the updated waiting flag and whether this unit broke the rules.
        """
        is_low_surrogate = 0xDC <= high_byte <= 0xDF
        if awaiting_low:
            # Only a low surrogate may follow a high surrogate.  Anything
            # else is an error and the waiting flag is left set.
            return (not is_low_surrogate, not is_low_surrogate)
        # Not inside a pair: a high surrogate opens one, a lone low
        # surrogate is an error, any other unit is fine.
        return (0xD8 <= high_byte <= 0xDB, is_low_surrogate)

    def validate_utf16_characters(self, pair: List[int]) -> None:
        """
        Check one group of two bytes against both UTF-16 byte orders.

        UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
        with an exception for surrogate pairs, which must be in the range
        0xD800-0xDBFF followed by 0xDC00-0xDFFF

        The matching invalid_utf16be / invalid_utf16le flag is set when the
        surrogate rules are broken; the flags are never cleared here.

        https://en.wikipedia.org/wiki/UTF-16
        """
        # Big-endian: pair[0] is the most significant byte.
        awaiting, broken = self._advance_surrogate_state(
            pair[0], self.first_half_surrogate_pair_detected_16be
        )
        self.first_half_surrogate_pair_detected_16be = awaiting
        if broken:
            self.invalid_utf16be = True

        # Little-endian: pair[1] is the most significant byte.
        awaiting, broken = self._advance_surrogate_state(
            pair[1], self.first_half_surrogate_pair_detected_16le
        )
        self.first_half_surrogate_pair_detected_16le = awaiting
        if broken:
            self.invalid_utf16le = True

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Consume byte_str, updating the statistics and validity flags.

        Returns the value of the state property after all bytes were read.
        """
        for byte in byte_str:
            slot = self.position % 4
            self.quad[slot] = byte
            if slot == 3:
                # A full group of four bytes is available: validate it as one
                # UTF-32 unit and as two consecutive UTF-16 units.
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[:2])
                self.validate_utf16_characters(self.quad[2:])
            tally = self.zeros_at_mod if byte == 0 else self.nonzeros_at_mod
            tally[slot] += 1
            self.position += 1
        return self.state

    @property
    def state(self) -> ProbingState:
        """
        Current probing state, re-evaluated on access while undecided.

        Once NOT_ME or FOUND_IT has been reached the state no longer changes
        until reset() is called.
        """
        undecided = self._state not in {
            ProbingState.NOT_ME,
            ProbingState.FOUND_IT,
        }
        if undecided:
            if self.get_confidence() > 0.80:
                self._state = ProbingState.FOUND_IT
            elif self.position > 4 * 1024:
                # if we get to 4kb into the file, and we can't conclude it's UTF,
                # let's give up
                self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self) -> float:
        """Return 0.85 when any of the four encodings looks likely, else 0.0."""
        if (
            self.is_likely_utf16le()
            or self.is_likely_utf16be()
            or self.is_likely_utf32le()
            or self.is_likely_utf32be()
        ):
            return 0.85
        return 0.00

219 self.is_likely_utf16le() 

220 or self.is_likely_utf16be() 

221 or self.is_likely_utf32le() 

222 or self.is_likely_utf32be() 

223 ) 

224 else 0.00 

225 )