Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/utf1632prober.py: 98%

######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Jason Zavaglia
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################

20from typing import Union

22from .charsetprober import CharSetProber

23from .enums import ProbingState

class UTF1632Prober(CharSetProber):
    r"""
    Detect UTF-16 and UTF-32 text from where its zero bytes fall.

    Every byte fed to the prober is tallied as zero or non-zero according to
    its offset modulo 4.  Input shaped like ( \0 \0 \0 [nonzero] )+ is
    probably UTF-32BE and input shaped like ( \0 [nonzero] )+ is probably
    UTF-16BE; the mirrored layouts indicate the little-endian encodings.
    Byte sequences that cannot be valid in a candidate encoding rule that
    candidate out until the prober is reset.
    """

    # Number of logical characters that must be seen before any guess is made.
    MIN_CHARS_FOR_DETECTION = 20
    # Share of characters whose "payload" byte position must be non-zero.
    # Tuned for ASCII-heavy text in UTF-16/32.
    EXPECTED_RATIO = 0.94
    # Lower bound on the share of zero bytes in the "padding" positions.
    # Non-ASCII text (e.g. CJK in UTF-16) has fewer null bytes but still shows
    # a clearer alternating pattern than random binary data.
    MIN_RATIO = 0.08

    def __init__(self) -> None:
        """Create the prober and leave it in its initial detecting state."""
        super().__init__()
        # Declare every attribute here so the instance layout is visible in
        # one place; reset() assigns the same starting values again.
        self.position = 0
        self.quad = [0, 0, 0, 0]
        self.zeros_at_mod = [0, 0, 0, 0]
        self.nonzeros_at_mod = [0, 0, 0, 0]
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self._state = ProbingState.DETECTING
        self.reset()

    def reset(self) -> None:
        """Forget all accumulated statistics and return to DETECTING."""
        super().reset()
        self._state = ProbingState.DETECTING
        # Byte counter and the four most recently seen bytes.
        self.position = 0
        self.quad = [0, 0, 0, 0]
        # Per-offset (position % 4) tallies of zero and non-zero bytes.
        self.zeros_at_mod = [0, 0, 0, 0]
        self.nonzeros_at_mod = [0, 0, 0, 0]
        # Sticky "this encoding is impossible" flags.
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        # True while a high surrogate is waiting for its low surrogate.
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False

    @property
    def charset_name(self) -> str:
        """Name of the first matching encoding, or "utf-16" if none match."""
        # Order matters: the 32-bit layouts are tested before the 16-bit ones.
        candidates = (
            (self.is_likely_utf32be, "utf-32be"),
            (self.is_likely_utf32le, "utf-32le"),
            (self.is_likely_utf16be, "utf-16be"),
            (self.is_likely_utf16le, "utf-16le"),
        )
        for looks_like, encoding in candidates:
            if looks_like():
                return encoding
        # default to something valid
        return "utf-16"

    @property
    def language(self) -> str:
        """This prober does not identify a language; always the empty string."""
        return ""

    def approx_32bit_chars(self) -> float:
        """Bytes seen so far divided by 4, never less than 1.0."""
        estimate = self.position / 4.0
        return estimate if estimate > 1.0 else 1.0

    def approx_16bit_chars(self) -> float:
        """Bytes seen so far divided by 2, never less than 1.0."""
        estimate = self.position / 2.0
        return estimate if estimate > 1.0 else 1.0

    def is_likely_utf32be(self) -> bool:
        """Return True if the byte statistics fit UTF-32BE.

        Offsets 0-2 of each quad should often be zero and offset 3 should
        almost always be non-zero.
        """
        chars = self.approx_32bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf32be:
            return False

        padding_zero_share = sum(self.zeros_at_mod[0:3]) / (chars * 3)
        payload_share = self.nonzeros_at_mod[3] / chars
        return (
            padding_zero_share > self.MIN_RATIO * 3
            and payload_share > self.EXPECTED_RATIO
        )

    def is_likely_utf32le(self) -> bool:
        """Return True if the byte statistics fit UTF-32LE.

        Offset 0 of each quad should almost always be non-zero and offsets
        1-3 should often be zero.
        """
        chars = self.approx_32bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf32le:
            return False

        payload_share = self.nonzeros_at_mod[0] / chars
        padding_zero_share = sum(self.zeros_at_mod[1:4]) / (chars * 3)
        return (
            payload_share > self.EXPECTED_RATIO
            and padding_zero_share > self.MIN_RATIO * 3
        )

    def is_likely_utf16be(self) -> bool:
        """Return True if the byte statistics fit UTF-16BE.

        Odd offsets should almost always be non-zero and even offsets should
        contain at least some zeros.
        """
        chars = self.approx_16bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf16be:
            return False

        odd_nonzero_share = sum(self.nonzeros_at_mod[1::2]) / chars
        even_zero_share = sum(self.zeros_at_mod[0::2]) / chars
        return (
            odd_nonzero_share > self.EXPECTED_RATIO
            and even_zero_share > self.MIN_RATIO
        )

    def is_likely_utf16le(self) -> bool:
        """Return True if the byte statistics fit UTF-16LE.

        Even offsets should almost always be non-zero and odd offsets should
        contain at least some zeros.  ASCII-heavy text pushes both shares
        very high; CJK-heavy text keeps the non-zero share high while the
        zero share may be low, though still above MIN_RATIO.
        """
        chars = self.approx_16bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION or self.invalid_utf16le:
            return False

        even_nonzero_share = sum(self.nonzeros_at_mod[0::2]) / chars
        odd_zero_share = sum(self.zeros_at_mod[1::2]) / chars
        return (
            even_nonzero_share > self.EXPECTED_RATIO
            and odd_zero_share > self.MIN_RATIO
        )

    def validate_utf32_characters(self, quad: list[int]) -> None:
        """
        Flag each UTF-32 byte order for which *quad* is not a legal code point.

        A UTF-32 code point lies in 0x00000000 - 0x0010FFFF and must not fall
        in the surrogate range 0x0000D800 - 0x0000DFFF.  The four bytes are
        checked once as big-endian and once as little-endian.

        https://en.wikipedia.org/wiki/UTF-32
        """
        first, second, third, fourth = quad[0], quad[1], quad[2], quad[3]

        # Big-endian reading: `first` is the most significant byte.
        be_out_of_range = first != 0 or second > 0x10
        be_surrogate = second == 0 and 0xD8 <= third <= 0xDF
        if be_out_of_range or be_surrogate:
            self.invalid_utf32be = True

        # Little-endian reading: `fourth` is the most significant byte.
        le_out_of_range = fourth != 0 or third > 0x10
        le_surrogate = third == 0 and 0xD8 <= second <= 0xDF
        if le_out_of_range or le_surrogate:
            self.invalid_utf32le = True

    @staticmethod
    def _advance_surrogate_state(
        awaiting_low: bool, high_byte: int
    ) -> tuple[bool, bool]:
        """Advance the surrogate-pair tracker by one 16-bit code unit.

        *high_byte* is the most significant byte of the code unit and
        *awaiting_low* says whether the previous unit was a high surrogate.
        Returns ``(awaiting_low, violation)`` for the state after this unit.
        """
        is_low_surrogate = 0xDC <= high_byte <= 0xDF
        if awaiting_low:
            # Only a low surrogate may follow a high one; anything else is a
            # violation and the tracker keeps waiting.
            return (not is_low_surrogate, not is_low_surrogate)
        if 0xD8 <= high_byte <= 0xDB:
            return (True, False)
        # A low surrogate with no preceding high surrogate is a violation.
        return (False, is_low_surrogate)

    def validate_utf16_characters(self, pair: list[int]) -> None:
        """
        Flag each UTF-16 byte order for which *pair* breaks surrogate rules.

        Code units 0xD800 - 0xDFFF are reserved for surrogate pairs: a high
        surrogate (0xD800 - 0xDBFF) must be followed immediately by a low
        surrogate (0xDC00 - 0xDFFF), and a low surrogate may not appear on
        its own.  The pair is checked once as big-endian (pair[0] is the
        high byte) and once as little-endian (pair[1] is the high byte).

        https://en.wikipedia.org/wiki/UTF-16
        """
        awaiting_low, violation = self._advance_surrogate_state(
            self.first_half_surrogate_pair_detected_16be, pair[0]
        )
        self.first_half_surrogate_pair_detected_16be = awaiting_low
        if violation:
            self.invalid_utf16be = True

        awaiting_low, violation = self._advance_surrogate_state(
            self.first_half_surrogate_pair_detected_16le, pair[1]
        )
        self.first_half_surrogate_pair_detected_16le = awaiting_low
        if violation:
            self.invalid_utf16le = True

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume *byte_str*, update the statistics and return the state."""
        for byte in byte_str:
            slot = self.position % 4
            self.quad[slot] = byte
            if slot == 3:
                # A full quad is available: check it as one UTF-32 unit and
                # as two consecutive UTF-16 units.
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[:2])
                self.validate_utf16_characters(self.quad[2:])
            tally = self.zeros_at_mod if byte == 0 else self.nonzeros_at_mod
            tally[slot] += 1
            self.position += 1
        return self.state

    @property
    def state(self) -> ProbingState:
        """Current probing state, re-evaluated until a verdict is reached."""
        undecided = self._state not in (
            ProbingState.NOT_ME,
            ProbingState.FOUND_IT,
        )
        if undecided:
            if self.get_confidence() > 0.80:
                self._state = ProbingState.FOUND_IT
            elif self.position > 4 * 1024:
                # More than 4 KiB in and still no UTF-16/32 pattern: give up.
                self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self) -> float:
        """Return 0.85 if any of the four encodings looks likely, else 0.0."""
        checks = (
            self.is_likely_utf16le,
            self.is_likely_utf16be,
            self.is_likely_utf32le,
            self.is_likely_utf32be,
        )
        if any(check() for check in checks):
            return 0.85
        return 0.00