Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/utf1632prober.py: 97%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

110 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# 

3# Contributor(s): 

4# Jason Zavaglia 

5# 

6# This library is free software; you can redistribute it and/or 

7# modify it under the terms of the GNU Lesser General Public 

8# License as published by the Free Software Foundation; either 

9# version 2.1 of the License, or (at your option) any later version. 

10# 

11# This library is distributed in the hope that it will be useful, 

12# but WITHOUT ANY WARRANTY; without even the implied warranty of 

13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

14# Lesser General Public License for more details. 

15# 

16# You should have received a copy of the GNU Lesser General Public 

17# License along with this library; if not, see 

18# <https://www.gnu.org/licenses/>. 

19######################### END LICENSE BLOCK ######################### 

20from typing import List, Union 

21 

22from .charsetprober import CharSetProber 

23from .enums import ProbingState 

24 

25 

class UTF1632Prober(CharSetProber):
    """
    Guess UTF-16 / UTF-32 (big- or little-endian) from where zero bytes fall.

    The prober tallies zero and non-zero bytes per position modulo 4.
    Input shaped like ( \\0 \\0 \\0 [nonzero] )+ is probably UTF-32BE, and
    input shaped like ( \\0 [nonzero] )+ is probably UTF-16BE; the mirrored
    layouts point at the little-endian variants.  Each complete group of
    four bytes is also checked for values that cannot occur in the
    respective encoding, which rules that encoding out.
    """

    # how many logical characters to scan before feeling confident of prediction
    MIN_CHARS_FOR_DETECTION = 20
    # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
    EXPECTED_RATIO = 0.94

    def __init__(self) -> None:
        super().__init__()
        # Declare every attribute here so the instance shape is visible in
        # one place; reset() below assigns the same starting values again.
        self.position = 0
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        self.quad = [0, 0, 0, 0]
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.reset()

    def reset(self) -> None:
        """Return the prober to its initial state with all tallies cleared."""
        super().reset()
        self._state = ProbingState.DETECTING
        # Byte offset into the stream and the four most recent bytes.
        self.position = 0
        self.quad = [0, 0, 0, 0]
        # Zero / non-zero tallies, indexed by byte position modulo 4.
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        # Set once a byte sequence impossible for that encoding is seen.
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        # True while a high surrogate is waiting for its low surrogate.
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False

    @property
    def charset_name(self) -> str:
        """Name of the first matching encoding, or "utf-16" if none match."""
        # Order matters: the UTF-32 checks run before the UTF-16 checks.
        candidates = (
            (self.is_likely_utf32be, "utf-32be"),
            (self.is_likely_utf32le, "utf-32le"),
            (self.is_likely_utf16be, "utf-16be"),
            (self.is_likely_utf16le, "utf-16le"),
        )
        for looks_like, name in candidates:
            if looks_like():
                return name
        # default to something valid
        return "utf-16"

    @property
    def language(self) -> str:
        """This prober reports no language; always the empty string."""
        return ""

    def approx_32bit_chars(self) -> float:
        """Bytes consumed divided by four, but never less than 1.0."""
        return max(1.0, self.position / 4.0)

    def approx_16bit_chars(self) -> float:
        """Bytes consumed divided by two, but never less than 1.0."""
        return max(1.0, self.position / 2.0)

    def is_likely_utf32be(self) -> bool:
        """True if offsets 0-2 are mostly zero and offset 3 mostly non-zero."""
        chars = self.approx_32bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf32be:
            return False
        tallies = (
            self.zeros_at_mod[0],
            self.zeros_at_mod[1],
            self.zeros_at_mod[2],
            self.nonzeros_at_mod[3],
        )
        return all(tally / chars > self.EXPECTED_RATIO for tally in tallies)

    def is_likely_utf32le(self) -> bool:
        """True if offset 0 is mostly non-zero and offsets 1-3 mostly zero."""
        chars = self.approx_32bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf32le:
            return False
        tallies = (
            self.nonzeros_at_mod[0],
            self.zeros_at_mod[1],
            self.zeros_at_mod[2],
            self.zeros_at_mod[3],
        )
        return all(tally / chars > self.EXPECTED_RATIO for tally in tallies)

    def is_likely_utf16be(self) -> bool:
        """True if even offsets are mostly zero and odd offsets mostly non-zero."""
        chars = self.approx_16bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf16be:
            return False
        nonzero_odd = self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]
        zero_even = self.zeros_at_mod[0] + self.zeros_at_mod[2]
        return (
            nonzero_odd / chars > self.EXPECTED_RATIO
            and zero_even / chars > self.EXPECTED_RATIO
        )

    def is_likely_utf16le(self) -> bool:
        """True if even offsets are mostly non-zero and odd offsets mostly zero."""
        chars = self.approx_16bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf16le:
            return False
        nonzero_even = self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]
        zero_odd = self.zeros_at_mod[1] + self.zeros_at_mod[3]
        return (
            nonzero_even / chars > self.EXPECTED_RATIO
            and zero_odd / chars > self.EXPECTED_RATIO
        )

    def validate_utf32_characters(self, quad: List[int]) -> None:
        """
        Flag UTF-32BE / UTF-32LE as invalid if ``quad`` cannot be a code point.

        UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
        excluding 0x0000D800 - 0x0000DFFF

        https://en.wikipedia.org/wiki/UTF-32
        """
        # Big-endian reading: quad[0] is the most significant byte.
        if quad[0] != 0 or quad[1] > 0x10:
            # value would exceed 0x0010FFFF
            self.invalid_utf32be = True
        elif quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF:
            # value falls in the surrogate range
            self.invalid_utf32be = True

        # Little-endian reading: quad[3] is the most significant byte.
        if quad[3] != 0 or quad[2] > 0x10:
            self.invalid_utf32le = True
        elif quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF:
            self.invalid_utf32le = True

    def validate_utf16_characters(self, pair: List[int]) -> None:
        """
        Flag UTF-16BE / UTF-16LE as invalid if ``pair`` breaks surrogate rules.

        UTF-16 code units cover 0x0000 - 0xFFFF.  Units in 0xD800 - 0xDFFF
        are only valid as surrogate pairs: one unit in 0xD800-0xDBFF
        followed immediately by one in 0xDC00-0xDFFF.

        https://en.wikipedia.org/wiki/UTF-16
        """
        # Big-endian reading: pair[0] is the high byte of the code unit.
        hi_byte = pair[0]
        if self.first_half_surrogate_pair_detected_16be:
            # A high surrogate is pending; only a low surrogate may follow.
            if 0xDC <= hi_byte <= 0xDF:
                self.first_half_surrogate_pair_detected_16be = False
            else:
                self.invalid_utf16be = True
        elif 0xD8 <= hi_byte <= 0xDB:
            self.first_half_surrogate_pair_detected_16be = True
        elif 0xDC <= hi_byte <= 0xDF:
            # low surrogate with no preceding high surrogate
            self.invalid_utf16be = True

        # Little-endian reading: pair[1] is the high byte of the code unit.
        hi_byte = pair[1]
        if self.first_half_surrogate_pair_detected_16le:
            if 0xDC <= hi_byte <= 0xDF:
                self.first_half_surrogate_pair_detected_16le = False
            else:
                self.invalid_utf16le = True
        elif 0xD8 <= hi_byte <= 0xDB:
            self.first_half_surrogate_pair_detected_16le = True
        elif 0xDC <= hi_byte <= 0xDF:
            self.invalid_utf16le = True

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Consume ``byte_str`` one byte at a time and return the current state.

        Every byte updates the zero / non-zero tally for its offset modulo 4.
        Each time a group of four bytes is complete, it is validated as one
        UTF-32 unit and as two UTF-16 units.
        """
        for byte in byte_str:
            slot = self.position % 4
            self.quad[slot] = byte
            if slot == 3:
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[:2])
                self.validate_utf16_characters(self.quad[2:])
            tally = self.zeros_at_mod if byte == 0 else self.nonzeros_at_mod
            tally[slot] += 1
            self.position += 1
        return self.state

    @property
    def state(self) -> ProbingState:
        """
        Current probing state, updated from the bytes seen so far.

        NOT_ME and FOUND_IT are terminal.  Otherwise the state becomes
        FOUND_IT once confidence exceeds 0.80, or NOT_ME once more than
        4 KiB has been consumed without reaching that confidence.
        """
        if self._state not in (ProbingState.NOT_ME, ProbingState.FOUND_IT):
            if self.get_confidence() > 0.80:
                self._state = ProbingState.FOUND_IT
            elif self.position > 4 * 1024:
                # if we get to 4kb into the file, and we can't conclude it's UTF,
                # let's give up
                self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self) -> float:
        """Return 0.85 if any of the four encodings looks likely, else 0.00."""
        checks = (
            self.is_likely_utf16le,
            self.is_likely_utf16be,
            self.is_likely_utf32le,
            self.is_likely_utf32be,
        )
        if any(check() for check in checks):
            return 0.85
        return 0.00

218 self.is_likely_utf16le() 

219 or self.is_likely_utf16be() 

220 or self.is_likely_utf32le() 

221 or self.is_likely_utf32be() 

222 ) 

223 else 0.00 

224 )