Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/utf1632prober.py: 97%

######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Jason Zavaglia
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################

20from typing import List, Union

22from .charsetprober import CharSetProber

23from .enums import ProbingState

class UTF1632Prober(CharSetProber):
    """
    Guess UTF-16 / UTF-32 encodings from where zero bytes fall in the input.

    For every byte offset modulo 4 the prober counts zero and non-zero
    bytes. Input that keeps repeating "three zero bytes, one non-zero byte"
    is probably UTF-32BE; input that keeps repeating "one zero byte, one
    non-zero byte" is probably UTF-16BE; the mirrored layouts point to the
    little-endian variants. Every complete group of four bytes is also
    checked for values that are illegal in each encoding, which rules that
    encoding out until ``reset`` is called.
    """

    # Number of logical characters that must be seen before a guess is made.
    MIN_CHARS_FOR_DETECTION = 20
    # Fixed share of characters that must show the expected zero / non-zero
    # byte at a given modulo-4 position.
    EXPECTED_RATIO = 0.94

    def __init__(self) -> None:
        super().__init__()
        # Attributes are declared here; reset() below assigns the very same
        # initial values again.
        self.position: int = 0  # number of bytes consumed by feed()
        self.quad: List[int] = [0, 0, 0, 0]  # current group of four bytes
        self.zeros_at_mod: List[int] = [0] * 4
        self.nonzeros_at_mod: List[int] = [0] * 4
        self._state = ProbingState.DETECTING
        self.invalid_utf32be: bool = False
        self.invalid_utf32le: bool = False
        self.invalid_utf16be: bool = False
        self.invalid_utf16le: bool = False
        self.first_half_surrogate_pair_detected_16be: bool = False
        self.first_half_surrogate_pair_detected_16le: bool = False
        self.reset()

    def reset(self) -> None:
        """Discard all counters and flags and return to the DETECTING state."""
        super().reset()
        self.position = 0
        self.quad = [0, 0, 0, 0]
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False

    @property
    def charset_name(self) -> str:
        """
        Name of the first encoding whose heuristic currently matches.

        The candidates are tried in the order UTF-32BE, UTF-32LE, UTF-16BE,
        UTF-16LE; ``"utf-16"`` is returned when none of them matches.
        """
        candidates = (
            (self.is_likely_utf32be, "utf-32be"),
            (self.is_likely_utf32le, "utf-32le"),
            (self.is_likely_utf16be, "utf-16be"),
            (self.is_likely_utf16le, "utf-16le"),
        )
        for looks_like, name in candidates:
            if looks_like():
                return name
        # default to something valid
        return "utf-16"

    @property
    def language(self) -> str:
        """This prober reports no language: always the empty string."""
        return ""

    def approx_32bit_chars(self) -> float:
        """Bytes seen divided by 4, but never less than 1.0."""
        return max(1.0, self.position / 4.0)

    def approx_16bit_chars(self) -> float:
        """Bytes seen divided by 2, but never less than 1.0."""
        return max(1.0, self.position / 2.0)

    def is_likely_utf32be(self) -> bool:
        """
        True when offsets 0-2 are almost always zero, offset 3 is almost
        always non-zero, enough characters were seen, and no group of four
        bytes was invalid as big-endian UTF-32.
        """
        chars = self.approx_32bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        threshold = self.EXPECTED_RATIO
        zeros = self.zeros_at_mod
        return (
            zeros[0] / chars > threshold
            and zeros[1] / chars > threshold
            and zeros[2] / chars > threshold
            and self.nonzeros_at_mod[3] / chars > threshold
            and not self.invalid_utf32be
        )

    def is_likely_utf32le(self) -> bool:
        """
        True when offset 0 is almost always non-zero, offsets 1-3 are almost
        always zero, enough characters were seen, and no group of four bytes
        was invalid as little-endian UTF-32.
        """
        chars = self.approx_32bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        threshold = self.EXPECTED_RATIO
        zeros = self.zeros_at_mod
        return (
            self.nonzeros_at_mod[0] / chars > threshold
            and zeros[1] / chars > threshold
            and zeros[2] / chars > threshold
            and zeros[3] / chars > threshold
            and not self.invalid_utf32le
        )

    def is_likely_utf16be(self) -> bool:
        """
        True when even offsets are almost always zero, odd offsets are
        almost always non-zero, enough characters were seen, and no invalid
        big-endian UTF-16 sequence was found.
        """
        chars = self.approx_16bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        threshold = self.EXPECTED_RATIO
        nonzero_share = (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / chars
        zero_share = (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / chars
        return (
            nonzero_share > threshold
            and zero_share > threshold
            and not self.invalid_utf16be
        )

    def is_likely_utf16le(self) -> bool:
        """
        True when even offsets are almost always non-zero, odd offsets are
        almost always zero, enough characters were seen, and no invalid
        little-endian UTF-16 sequence was found.
        """
        chars = self.approx_16bit_chars()
        if chars < self.MIN_CHARS_FOR_DETECTION:
            return False
        threshold = self.EXPECTED_RATIO
        nonzero_share = (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / chars
        zero_share = (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / chars
        return (
            nonzero_share > threshold
            and zero_share > threshold
            and not self.invalid_utf16le
        )

    def validate_utf32_characters(self, quad: List[int]) -> None:
        """
        Check one group of four bytes against the UTF-32 value range.

        A UTF-32 value must lie in 0x00000000 - 0x0010FFFF and must not be
        a surrogate (0x0000D800 - 0x0000DFFF). The group is examined in both
        byte orders; a violation sets ``invalid_utf32be`` or
        ``invalid_utf32le``. This method only ever sets those flags, it
        never clears them.

        https://en.wikipedia.org/wiki/UTF-32
        """
        # Big-endian: quad[0] is the most significant byte.
        be_out_of_range = quad[0] != 0 or quad[1] > 0x10
        if be_out_of_range or (quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF):
            self.invalid_utf32be = True

        # Little-endian: quad[3] is the most significant byte.
        le_out_of_range = quad[3] != 0 or quad[2] > 0x10
        if le_out_of_range or (quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF):
            self.invalid_utf32le = True

    def validate_utf16_characters(self, pair: List[int]) -> None:
        """
        Check one two-byte unit against the UTF-16 surrogate rules.

        Units in 0xD800 - 0xDFFF are only valid as a surrogate pair: a unit
        in 0xD800 - 0xDBFF immediately followed by one in 0xDC00 - 0xDFFF.
        Only the most significant byte of the unit is inspected: ``pair[0]``
        for the big-endian reading and ``pair[1]`` for the little-endian
        reading. A violation sets ``invalid_utf16be`` / ``invalid_utf16le``.

        https://en.wikipedia.org/wiki/UTF-16
        """
        # Big-endian reading.
        lead = pair[0]
        is_second_half = 0xDC <= lead <= 0xDF
        if self.first_half_surrogate_pair_detected_16be:
            # The previous unit opened a pair, so this one must close it.
            if is_second_half:
                self.first_half_surrogate_pair_detected_16be = False
            else:
                self.invalid_utf16be = True
        elif 0xD8 <= lead <= 0xDB:
            self.first_half_surrogate_pair_detected_16be = True
        elif is_second_half:
            # Second half of a pair without a preceding first half.
            self.invalid_utf16be = True

        # Little-endian reading.
        lead = pair[1]
        is_second_half = 0xDC <= lead <= 0xDF
        if self.first_half_surrogate_pair_detected_16le:
            if is_second_half:
                self.first_half_surrogate_pair_detected_16le = False
            else:
                self.invalid_utf16le = True
        elif 0xD8 <= lead <= 0xDB:
            self.first_half_surrogate_pair_detected_16le = True
        elif is_second_half:
            self.invalid_utf16le = True

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Consume ``byte_str`` and return the resulting probing state.

        Each byte is stored in the current group of four and tallied as zero
        or non-zero under its offset modulo 4. Validation runs each time the
        fourth byte of a group has been stored.
        """
        for byte in byte_str:
            slot = self.position % 4
            self.quad[slot] = byte
            if slot == 3:
                # The group is complete: vet it as one UTF-32 unit and as
                # two consecutive UTF-16 units.
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[0:2])
                self.validate_utf16_characters(self.quad[2:4])
            tally = self.zeros_at_mod if byte == 0 else self.nonzeros_at_mod
            tally[slot] += 1
            self.position += 1
        return self.state

    @property
    def state(self) -> ProbingState:
        """
        Current probing state, re-evaluated on each access until decided.

        NOT_ME and FOUND_IT are terminal. Otherwise the state becomes
        FOUND_IT once the confidence exceeds 0.80, or NOT_ME once more than
        4 KiB has been read without reaching that confidence.
        """
        decided = (ProbingState.NOT_ME, ProbingState.FOUND_IT)
        if self._state in decided:
            return self._state
        if self.get_confidence() > 0.80:
            self._state = ProbingState.FOUND_IT
        elif self.position > 4 * 1024:
            # 4 KiB into the input without a conclusion: give up.
            self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self) -> float:
        """Return 0.85 if any of the four heuristics matches, else 0.0."""
        matched = (
            self.is_likely_utf16le()
            or self.is_likely_utf16be()
            or self.is_likely_utf32le()
            or self.is_likely_utf32be()
        )
        return 0.85 if matched else 0.00