1######################## BEGIN LICENSE BLOCK ########################
2#
3# Contributor(s):
4# Jason Zavaglia
5#
6# This library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# This library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with this library; if not, see
18# <https://www.gnu.org/licenses/>.
19######################### END LICENSE BLOCK #########################
20from typing import Union
21
22from .charsetprober import CharSetProber
23from .enums import ProbingState
24
25
class UTF1632Prober(CharSetProber):
    """
    Detect UTF-16 and UTF-32 text by looking at where zero bytes occur.

    The prober counts zero and non-zero bytes at each byte position modulo 4
    and infers whether the data is UTF-16 or UTF-32 (little-endian or
    big-endian). For instance, data looking like (0x00 0x00 0x00 [nonzero])+
    has a good probability of being UTF-32BE, and data looking like
    (0x00 [nonzero])+ may be guessed to be UTF-16BE; the mirrored patterns
    indicate the little-endian varieties.

    While counting, every complete code unit is also checked for validity in
    each of the four candidate encodings, so that a candidate is rejected as
    soon as the data cannot possibly be encoded that way.
    """

    # how many logical characters to scan before feeling confident of prediction
    MIN_CHARS_FOR_DETECTION = 20
    # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
    # For ASCII-heavy text in UTF-16/32
    EXPECTED_RATIO = 0.94
    # Minimum ratio for non-ASCII text (e.g., CJK characters in UTF-16)
    # CJK text has fewer null bytes but still shows clear alternating patterns
    # compared to random binary data
    MIN_RATIO = 0.08

    def __init__(self) -> None:
        super().__init__()
        # Every attribute is declared here so it exists from construction on;
        # reset() below assigns the same starting values again.
        self.position = 0  # total number of bytes fed so far
        self.zeros_at_mod = [0] * 4  # zero bytes seen at position % 4
        self.nonzeros_at_mod = [0] * 4  # non-zero bytes seen at position % 4
        self._state = ProbingState.DETECTING
        self.quad = [0, 0, 0, 0]  # bytes of the 4-byte group being filled
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        # True while a high surrogate has been seen and its low half is due.
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.reset()

    def reset(self) -> None:
        """Put the prober back into its initial, nothing-seen-yet state."""
        super().reset()
        self.position = 0
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.quad = [0, 0, 0, 0]

    @property
    def charset_name(self) -> str:
        """
        Name of the most likely encoding given the bytes seen so far.

        The UTF-32 candidates are tested before the UTF-16 ones. When no
        candidate is currently likely, "utf-16" is returned as a valid default.
        """
        if self.is_likely_utf32be():
            return "utf-32be"
        if self.is_likely_utf32le():
            return "utf-32le"
        if self.is_likely_utf16be():
            return "utf-16be"
        if self.is_likely_utf16le():
            return "utf-16le"
        # default to something valid
        return "utf-16"

    @property
    def language(self) -> str:
        """This prober does not identify a language; always the empty string."""
        return ""

    def approx_32bit_chars(self) -> float:
        """Approximate number of 4-byte characters seen so far (at least 1.0)."""
        return max(1.0, self.position / 4.0)

    def approx_16bit_chars(self) -> float:
        """Approximate number of 2-byte characters seen so far (at least 1.0)."""
        return max(1.0, self.position / 2.0)

    def is_likely_utf32be(self) -> bool:
        """Return True if the byte statistics so far look like UTF-32BE."""
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        # For UTF-32BE: first 3 bytes (0,1,2) often zero, last byte (3) non-zero
        zero_012_ratio = (
            self.zeros_at_mod[0] + self.zeros_at_mod[1] + self.zeros_at_mod[2]
        ) / (approx_chars * 3)
        nonzero_3_ratio = self.nonzeros_at_mod[3] / approx_chars

        return (
            zero_012_ratio > self.MIN_RATIO * 3
            and nonzero_3_ratio > self.EXPECTED_RATIO
            and not self.invalid_utf32be
        )

    def is_likely_utf32le(self) -> bool:
        """Return True if the byte statistics so far look like UTF-32LE."""
        approx_chars = self.approx_32bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        # For UTF-32LE: first byte (0) non-zero, bytes 1,2,3 often zero
        nonzero_0_ratio = self.nonzeros_at_mod[0] / approx_chars
        zero_123_ratio = (
            self.zeros_at_mod[1] + self.zeros_at_mod[2] + self.zeros_at_mod[3]
        ) / (approx_chars * 3)

        return (
            nonzero_0_ratio > self.EXPECTED_RATIO
            and zero_123_ratio > self.MIN_RATIO * 3
            and not self.invalid_utf32le
        )

    def is_likely_utf16be(self) -> bool:
        """Return True if the byte statistics so far look like UTF-16BE."""
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        nonzero_ratio = (
            self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]
        ) / approx_chars
        zero_ratio = (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars

        # For UTF-16BE, odd positions should be non-zero, even positions
        # should have zeros
        return (
            nonzero_ratio > self.EXPECTED_RATIO
            and zero_ratio > self.MIN_RATIO
            and not self.invalid_utf16be
        )

    def is_likely_utf16le(self) -> bool:
        """Return True if the byte statistics so far look like UTF-16LE."""
        approx_chars = self.approx_16bit_chars()
        if approx_chars < self.MIN_CHARS_FOR_DETECTION:
            return False

        nonzero_ratio = (
            self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]
        ) / approx_chars
        zero_ratio = (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars

        # For UTF-16LE, even positions should be non-zero, odd positions
        # should have zeros
        # ASCII-heavy: both ratios > EXPECTED_RATIO (94%)
        # CJK-heavy: nonzero_ratio > 94%, zero_ratio might be low but must
        # still exceed MIN_RATIO (8%)
        return (
            nonzero_ratio > self.EXPECTED_RATIO
            and zero_ratio > self.MIN_RATIO
            and not self.invalid_utf16le
        )

    def validate_utf32_characters(self, quad: list[int]) -> None:
        """
        Validate if the quad of bytes is valid UTF-32.

        UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
        excluding 0x0000D800 - 0x0000DFFF

        The quad is interpreted both as big-endian and as little-endian; the
        matching ``invalid_utf32be`` / ``invalid_utf32le`` flag is set when
        the value is out of range for that byte order. Flags are never
        cleared here.

        https://en.wikipedia.org/wiki/UTF-32
        """
        # Big-endian: quad[0] is the most significant byte. In the surrogate
        # test quad[0] is already known to be zero, otherwise the first
        # operand would have short-circuited the expression.
        if (
            quad[0] != 0
            or quad[1] > 0x10
            or (quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
        ):
            self.invalid_utf32be = True
        # Little-endian: the same checks with the byte order mirrored.
        if (
            quad[3] != 0
            or quad[2] > 0x10
            or (quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
        ):
            self.invalid_utf32le = True

    @staticmethod
    def _surrogate_step(high_byte: int, awaiting_low: bool) -> tuple[bool, bool]:
        """
        Advance the surrogate-pair state machine by one UTF-16 code unit.

        ``high_byte`` is the most significant byte of the code unit and
        ``awaiting_low`` tells whether the previous unit was a high surrogate.
        Returns ``(awaiting_low, invalid)``: the new waiting state and whether
        this code unit broke the surrogate pairing rules.
        """
        is_low_surrogate = 0xDC <= high_byte <= 0xDF
        if awaiting_low:
            if is_low_surrogate:
                # pair completed
                return False, False
            # high surrogate not followed by a low surrogate
            return True, True
        if 0xD8 <= high_byte <= 0xDB:
            # first half of a pair; the next unit must be a low surrogate
            return True, False
        # a low surrogate without a preceding high surrogate is invalid
        return False, is_low_surrogate

    def validate_utf16_characters(self, pair: list[int]) -> None:
        """
        Validate if the pair of bytes is valid UTF-16.

        UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
        with an exception for surrogate pairs, which must be in the range
        0xD800-0xDBFF followed by 0xDC00-0xDFFF

        The pair is interpreted both as big-endian (``pair[0]`` is the high
        byte) and as little-endian (``pair[1]`` is the high byte); the
        matching ``invalid_utf16be`` / ``invalid_utf16le`` flag is set when
        the surrogate rules are violated. Flags are never cleared here.

        https://en.wikipedia.org/wiki/UTF-16
        """
        awaiting_low, invalid = self._surrogate_step(
            pair[0], self.first_half_surrogate_pair_detected_16be
        )
        self.first_half_surrogate_pair_detected_16be = awaiting_low
        if invalid:
            self.invalid_utf16be = True

        awaiting_low, invalid = self._surrogate_step(
            pair[1], self.first_half_surrogate_pair_detected_16le
        )
        self.first_half_surrogate_pair_detected_16le = awaiting_low
        if invalid:
            self.invalid_utf16le = True

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Consume ``byte_str`` and return the resulting probing state.

        Each byte is tallied as zero / non-zero at its position modulo 4.
        A UTF-16 code unit is validated as soon as its two bytes are
        available, and a UTF-32 code unit once all four bytes are available,
        so a trailing 16-bit unit in input whose length is not a multiple of
        four is still checked. May be called repeatedly with successive
        chunks; the position carries over between calls.
        """
        for c in byte_str:
            mod4 = self.position % 4
            self.quad[mod4] = c
            if mod4 == 1:
                # first 16-bit unit of the current group is complete
                self.validate_utf16_characters(self.quad[0:2])
            elif mod4 == 3:
                # the 32-bit unit and the second 16-bit unit are complete
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[2:4])
            if c == 0:
                self.zeros_at_mod[mod4] += 1
            else:
                self.nonzeros_at_mod[mod4] += 1
            self.position += 1
        return self.state

    @property
    def state(self) -> ProbingState:
        """
        Current probing state, re-evaluated on access while still undecided.

        NOT_ME and FOUND_IT are terminal and returned unchanged. Otherwise
        the state becomes FOUND_IT once the confidence exceeds 0.80, or
        NOT_ME once more than 4 KiB has been read without a conclusion.
        """
        if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
            # terminal, decided states
            return self._state
        if self.get_confidence() > 0.80:
            self._state = ProbingState.FOUND_IT
        elif self.position > 4 * 1024:
            # if we get to 4kb into the file, and we can't conclude it's UTF,
            # let's give up
            self._state = ProbingState.NOT_ME
        return self._state

    def get_confidence(self) -> float:
        """Return 0.85 if any of the four encodings is likely, else 0.0."""
        any_likely = (
            self.is_likely_utf16le()
            or self.is_likely_utf16be()
            or self.is_likely_utf32le()
            or self.is_likely_utf32be()
        )
        return 0.85 if any_likely else 0.00