Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/utf1632prober.py: 98%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

127 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# 

3# Contributor(s): 

4# Jason Zavaglia 

5# 

6# This library is free software; you can redistribute it and/or 

7# modify it under the terms of the GNU Lesser General Public 

8# License as published by the Free Software Foundation; either 

9# version 2.1 of the License, or (at your option) any later version. 

10# 

11# This library is distributed in the hope that it will be useful, 

12# but WITHOUT ANY WARRANTY; without even the implied warranty of 

13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

14# Lesser General Public License for more details. 

15# 

16# You should have received a copy of the GNU Lesser General Public 

17# License along with this library; if not, see 

18# <https://www.gnu.org/licenses/>. 

19######################### END LICENSE BLOCK ######################### 

20from typing import Union 

21 

22from .charsetprober import CharSetProber 

23from .enums import ProbingState 

24 

25 

class UTF1632Prober(CharSetProber):
    r"""
    Guess UTF-16 / UTF-32 encodings from where zero bytes occur.

    This class counts zero and non-zero bytes at each stream position
    modulo 4 and infers whether the input is UTF16 or UTF32 (little-endian
    or big-endian). For instance, files looking like ( \0 \0 \0 [nonzero] )+
    have a good probability to be UTF32BE. Files looking like ( \0 [nonzero] )+
    may be guessed to be UTF16BE, and inversely for little-endian varieties.

    Every complete group of four bytes is additionally checked for byte
    sequences that are illegal in each candidate encoding; one illegal
    sequence rules that candidate out until ``reset()`` is called.
    """

    # how many logical characters to scan before feeling confident of prediction
    MIN_CHARS_FOR_DETECTION = 20
    # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
    # For ASCII-heavy text in UTF-16/32
    EXPECTED_RATIO = 0.94
    # Minimum ratio for non-ASCII text (e.g., CJK characters in UTF-16)
    # CJK text has fewer null bytes but still shows clear alternating patterns
    # compared to random binary data
    MIN_RATIO = 0.08

    def __init__(self) -> None:
        super().__init__()
        # Declare every attribute here so it always exists on the instance;
        # reset() below assigns the same initial values again.
        self.position = 0  # number of bytes consumed so far
        self.zeros_at_mod = [0] * 4  # zero-byte count per (position % 4)
        self.nonzeros_at_mod = [0] * 4  # non-zero-byte count per (position % 4)
        self._state = ProbingState.DETECTING
        self.quad = [0, 0, 0, 0]  # the 4-byte group currently being filled
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        # True while a high surrogate is waiting for its low surrogate.
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.reset()

    def reset(self) -> None:
        """Discard all counters and validity flags and resume detecting."""
        super().reset()
        self.position = 0
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.quad = [0, 0, 0, 0]

    @property
    def charset_name(self) -> str:
        """
        Name of the most likely encoding given the bytes seen so far.

        The UTF-32 candidates are tested before the UTF-16 ones; when no
        candidate qualifies, the generic "utf-16" is returned.
        """
        if self.is_likely_utf32be():
            return "utf-32be"
        if self.is_likely_utf32le():
            return "utf-32le"
        if self.is_likely_utf16be():
            return "utf-16be"
        if self.is_likely_utf16le():
            return "utf-16le"
        # default to something valid
        return "utf-16"

    @property
    def language(self) -> str:
        """Always the empty string; this prober does not report a language."""
        return ""

    def approx_32bit_chars(self) -> float:
        """Return the bytes consumed divided by 4, but never less than 1.0."""
        return max(1.0, self.position / 4.0)

    def approx_16bit_chars(self) -> float:
        """Return the bytes consumed divided by 2, but never less than 1.0."""
        return max(1.0, self.position / 2.0)

    def is_likely_utf32be(self) -> bool:
        """Return True if the byte statistics so far look like UTF-32BE."""
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        # For UTF-32BE: first 3 bytes (0,1,2) often zero, last byte (3) non-zero
        zero_012_ratio = (
            self.zeros_at_mod[0] + self.zeros_at_mod[1] + self.zeros_at_mod[2]
        ) / (approx_chars * 3)
        nonzero_3_ratio = self.nonzeros_at_mod[3] / approx_chars

        return (
            zero_012_ratio > self.MIN_RATIO * 3
            and nonzero_3_ratio > self.EXPECTED_RATIO
            and not self.invalid_utf32be
        )

    def is_likely_utf32le(self) -> bool:
        """Return True if the byte statistics so far look like UTF-32LE."""
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        # For UTF-32LE: first byte (0) non-zero, bytes 1,2,3 often zero
        nonzero_0_ratio = self.nonzeros_at_mod[0] / approx_chars
        zero_123_ratio = (
            self.zeros_at_mod[1] + self.zeros_at_mod[2] + self.zeros_at_mod[3]
        ) / (approx_chars * 3)

        return (
            nonzero_0_ratio > self.EXPECTED_RATIO
            and zero_123_ratio > self.MIN_RATIO * 3
            and not self.invalid_utf32le
        )

    def is_likely_utf16be(self) -> bool:
        """Return True if the byte statistics so far look like UTF-16BE."""
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        nonzero_ratio = (
            self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]
        ) / approx_chars
        zero_ratio = (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars

        # For UTF-16BE, odd positions should be non-zero, even positions
        # should have zeros:
        #   nonzero_ratio must exceed EXPECTED_RATIO (94%)
        #   zero_ratio only has to exceed MIN_RATIO (8%)
        return (
            nonzero_ratio > self.EXPECTED_RATIO
            and zero_ratio > self.MIN_RATIO
            and not self.invalid_utf16be
        )

    def is_likely_utf16le(self) -> bool:
        """Return True if the byte statistics so far look like UTF-16LE."""
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        nonzero_ratio = (
            self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]
        ) / approx_chars
        zero_ratio = (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars

        # For UTF-16LE, even positions should be non-zero, odd positions
        # should have zeros:
        #   ASCII-heavy: both ratios > 94%
        #   CJK-heavy: nonzero_ratio > 94%, zero_ratio might be low but must
        #   still exceed MIN_RATIO (8%)
        return (
            nonzero_ratio > self.EXPECTED_RATIO
            and zero_ratio > self.MIN_RATIO
            and not self.invalid_utf16le
        )

    def validate_utf32_characters(self, quad: list[int]) -> None:
        """
        Flag UTF-32BE / UTF-32LE as invalid if ``quad`` is illegal for it.

        UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
        excluding 0x0000D800 - 0x0000DFFF

        ``quad`` holds four byte values in stream order. It is interpreted
        once as big-endian and once as little-endian; the flags are only
        ever set here, never cleared.

        https://en.wikipedia.org/wiki/UTF-32
        """
        # Big-endian: quad[0] is the most significant byte. Once quad[0] is
        # known to be zero, a surrogate code point is quad[1] == 0 together
        # with quad[2] in 0xD8-0xDF.
        if (
            quad[0] != 0
            or quad[1] > 0x10
            or (quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
        ):
            self.invalid_utf32be = True
        # Little-endian: the same test with the byte order mirrored.
        if (
            quad[3] != 0
            or quad[2] > 0x10
            or (quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
        ):
            self.invalid_utf32le = True

    def validate_utf16_characters(self, pair: list[int]) -> None:
        """
        Flag UTF-16BE / UTF-16LE as invalid if ``pair`` is illegal for it.

        UTF-16 code units span 0x0000 - 0xFFFF. Units in 0xD800 - 0xDFFF are
        only valid as a surrogate pair: a unit in 0xD800-0xDBFF immediately
        followed by a unit in 0xDC00-0xDFFF.

        ``pair`` holds two byte values in stream order. ``pair[0]`` is taken
        as the high byte for big-endian and ``pair[1]`` as the high byte for
        little-endian; only the high byte is needed to classify a unit.

        https://en.wikipedia.org/wiki/UTF-16
        """
        if not self.first_half_surrogate_pair_detected_16be:
            if 0xD8 <= pair[0] <= 0xDB:
                self.first_half_surrogate_pair_detected_16be = True
            elif 0xDC <= pair[0] <= 0xDF:
                # low surrogate without a preceding high surrogate
                self.invalid_utf16be = True
        elif 0xDC <= pair[0] <= 0xDF:
            # the pending high surrogate has been completed
            self.first_half_surrogate_pair_detected_16be = False
        else:
            # high surrogate not followed by a low surrogate
            self.invalid_utf16be = True

        if not self.first_half_surrogate_pair_detected_16le:
            if 0xD8 <= pair[1] <= 0xDB:
                self.first_half_surrogate_pair_detected_16le = True
            elif 0xDC <= pair[1] <= 0xDF:
                self.invalid_utf16le = True
        elif 0xDC <= pair[1] <= 0xDF:
            self.first_half_surrogate_pair_detected_16le = False
        else:
            self.invalid_utf16le = True

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Consume ``byte_str`` and return the resulting probing state.

        Each byte is stored in the current 4-byte group and counted as zero
        or non-zero for its position modulo 4. Whenever a group is complete
        it is validated as one UTF-32 unit and as two UTF-16 units. The
        position carries over between calls, so input may arrive in chunks.
        """
        for c in byte_str:
            mod4 = self.position % 4
            self.quad[mod4] = c
            if mod4 == 3:
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[0:2])
                self.validate_utf16_characters(self.quad[2:4])
            if c == 0:
                self.zeros_at_mod[mod4] += 1
            else:
                self.nonzeros_at_mod[mod4] += 1
            self.position += 1
        return self.state

    @property
    def state(self) -> ProbingState:
        """
        Current probing state, re-evaluated on access until it is terminal.

        Becomes FOUND_IT once the confidence exceeds 0.80, or NOT_ME once
        more than 4 KiB have been consumed without reaching that confidence.
        """
        if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
            # terminal, decided states
            return self._state
        if self.get_confidence() > 0.80:
            self._state = ProbingState.FOUND_IT
        elif self.position > 4 * 1024:
            # if we get to 4kb into the file, and we can't conclude it's UTF,
            # let's give up
            self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self) -> float:
        """Return 0.85 if any UTF-16/32 variant currently looks likely, else 0.0."""
        return (
            0.85
            if (
                self.is_likely_utf16le()
                or self.is_likely_utf16be()
                or self.is_likely_utf32le()
                or self.is_likely_utf32be()
            )
            else 0.00
        )