1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, see
25# <https://www.gnu.org/licenses/>.
26######################### END LICENSE BLOCK #########################
27"""
Module containing the ``UniversalDetector`` class, which is the primary
class a user of ``chardet`` should use.
30
31:author: Mark Pilgrim (initial port to Python)
32:author: Shy Shalom (original C code)
33:author: Dan Blanchard (major refactoring for 3.0)
34:author: Ian Cordasco
35"""
36
37import codecs
38import logging
39import re
40from typing import List, Optional, Union
41
42from .charsetgroupprober import CharSetGroupProber
43from .charsetprober import CharSetProber
44from .enums import InputState, LanguageFilter, ProbingState
45from .escprober import EscCharSetProber
46from .latin1prober import Latin1Prober
47from .macromanprober import MacRomanProber
48from .mbcsgroupprober import MBCSGroupProber
49from .resultdict import ResultDict
50from .sbcsgroupprober import SBCSGroupProber
51from .utf1632prober import UTF1632Prober
52
53
class UniversalDetector:
    """
    Coordinator for all of the charset probers; the ``chardet.detect``
    function is built on top of this class.

    Typical usage, which leaves a ``dict`` holding the detected encoding and
    its confidence in ``detected``:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # ``close`` only reports a prober whose confidence is strictly above this.
    MINIMUM_THRESHOLD = 0.20
    # Any byte with the high bit set (0x80-0xFF).
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xff]")
    # An ESC byte (octal 033) or the two-byte sequence "~{".
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    # Bytes 0x80-0x9F; seeing one makes ``close`` prefer a Windows code page
    # name over the corresponding ISO-8859 name.
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")
    # Lower-cased ISO-8859 name -> Windows code page name used in its place.
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        """
        Build a detector in its initial state.

        :param lang_filter: Filter handed to the escape-sequence and
            multi-byte probers; its ``NON_CJK`` bit also decides whether the
            single-byte group prober takes part.
        :param should_rename_legacy: When true, ``close`` passes the name
            chosen from the high-byte probers through ``LEGACY_MAP``.
        """
        # Caller-supplied configuration.
        self.lang_filter = lang_filter
        self.should_rename_legacy = should_rename_legacy
        self.logger = logging.getLogger(__name__)

        # Probers are created lazily by ``feed`` the first time they are
        # needed; ``reset`` only resets the ones that already exist.
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []

        # Per-document state, declared here and (re)initialised by ``reset``.
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self._has_win_bytes = False

        self.reset()

    @property
    def input_state(self) -> int:
        """Current ``InputState`` classification of the data fed so far."""
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        """Whether ``WIN_BYTE_DETECTOR`` has matched any high-byte chunk."""
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        """The probers used for high-byte input (empty until first needed)."""
        return self._charset_probers

    def reset(self) -> None:
        """
        Return the detector, and every prober it has created so far, to the
        initial state.

        ``__init__`` already calls this, so an explicit call is only needed
        between analyses of different documents.
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""

        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Run one chunk of a document through all of the relevant probers.

        Once this returns, inspect ``done``: if it is ``True`` a prediction is
        available in ``result``; otherwise keep feeding data. Calls made after
        ``done`` became ``True``, and calls with an empty chunk, do nothing.

        .. note::
            Always call ``close`` when you have no more data to feed and
            ``done`` is not already ``True``.
        """
        if self.done or not byte_str:
            return

        data = byte_str if isinstance(byte_str, bytearray) else bytearray(byte_str)

        if not self._got_data:
            # First non-empty chunk: look for a BOM before anything else,
            # since a known BOM is treated as conclusive.
            bom_result = self._bom_result(data)
            if bom_result is not None:
                self.result = bom_result
            self._got_data = True
            if self.result["encoding"] is not None:
                self.done = True
                return

        self._track_input_state(data)

        # Next, see whether the data looks like UTF-16 or UTF-32.
        if self._probe_utf1632(data):
            return

        if self._input_state == InputState.ESC_ASCII:
            self._probe_escape_sequences(data)
        elif self._input_state == InputState.HIGH_BYTE:
            self._probe_high_bytes(data)

    @staticmethod
    def _bom_result(data: bytearray) -> Optional[ResultDict]:
        """Return the result implied by a leading BOM, or ``None`` if none."""
        # Order matters: the 4-byte marks must be tried before the 2-byte
        # UTF-16 marks they begin with. First match wins.
        bom_table = (
            # EF BB BF  UTF-8 with BOM
            ((codecs.BOM_UTF8,), "UTF-8-SIG"),
            # FF FE 00 00  UTF-32, little-endian BOM
            # 00 00 FE FF  UTF-32, big-endian BOM
            ((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE), "UTF-32"),
            # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
            # TODO: This encoding is not supported by Python. Should remove?
            ((b"\xfe\xff\x00\x00",), "X-ISO-10646-UCS-4-3412"),
            # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
            # TODO: This encoding is not supported by Python. Should remove?
            ((b"\x00\x00\xff\xfe",), "X-ISO-10646-UCS-4-2143"),
            # FF FE  UTF-16, little endian BOM
            # FE FF  UTF-16, big endian BOM
            ((codecs.BOM_LE, codecs.BOM_BE), "UTF-16"),
        )
        for prefixes, encoding in bom_table:
            if data.startswith(prefixes):
                return {"encoding": encoding, "confidence": 1.0, "language": ""}
        return None

    def _track_input_state(self, data: bytearray) -> None:
        """Leave ``PURE_ASCII`` if the chunk has high bytes or an escape."""
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(data):
                self._input_state = InputState.HIGH_BYTE
            # The previous chunk's last byte is prepended so that a "~{"
            # split across two chunks is still seen.
            elif self.ESC_DETECTOR.search(self._last_char + data):
                self._input_state = InputState.ESC_ASCII

        self._last_char = data[-1:]

    def _probe_utf1632(self, data: bytearray) -> bool:
        """Feed the UTF-16/32 prober; return ``True`` if it settled things."""
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()
        prober = self._utf1632_prober

        # Only feed the prober while it is still undecided.
        if prober.state != ProbingState.DETECTING:
            return False
        if prober.feed(data) != ProbingState.FOUND_IT:
            return False

        self.result = {
            "encoding": prober.charset_name,
            "confidence": prober.get_confidence(),
            "language": "",
        }
        self.done = True
        return True

    def _probe_escape_sequences(self, data: bytearray) -> None:
        """
        Feed the escape-sequence prober, creating it on first use.

        Per the original author's notes, it is a simple state machine for the
        escape sequences of HZ and ISO-2022, the only encodings that use them.
        """
        if not self._esc_charset_prober:
            self._esc_charset_prober = EscCharSetProber(self.lang_filter)
        prober = self._esc_charset_prober

        if prober.feed(data) == ProbingState.FOUND_IT:
            self.result = {
                "encoding": prober.charset_name,
                "confidence": prober.get_confidence(),
                "language": prober.language,
            }
            self.done = True

    def _probe_high_bytes(self, data: bytearray) -> None:
        """
        Feed the multi-byte and single-byte probers, creating them on first
        use, and record whether the chunk contains bytes in 0x80-0x9F.

        Per the original author's notes, the single-byte probers rely on
        character bigram distributions, while the multi-byte probers combine
        character unigram and bigram distributions.
        """
        if not self._charset_probers:
            self._charset_probers = [MBCSGroupProber(self.lang_filter)]
            # If we're checking non-CJK encodings, use single-byte prober
            if self.lang_filter & LanguageFilter.NON_CJK:
                self._charset_probers.append(SBCSGroupProber())
            self._charset_probers.append(Latin1Prober())
            self._charset_probers.append(MacRomanProber())

        # Stop at the first prober that is certain; later ones are not fed.
        for prober in self._charset_probers:
            if prober.feed(data) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": prober.charset_name,
                    "confidence": prober.get_confidence(),
                    "language": prober.language,
                }
                self.done = True
                break

        # Checked even when a prober just finished the detection.
        if self.WIN_BYTE_DETECTOR.search(data):
            self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and settle on a final prediction.

        :returns: The ``result`` attribute, a ``dict`` with the keys
                  `encoding`, `confidence`, and `language`.
        """
        # Nothing left to decide if a prediction was already made.
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")
        elif self._input_state == InputState.PURE_ASCII:
            # Only ASCII was seen, so that is the answer.
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}
        elif self._input_state == InputState.HIGH_BYTE:
            self._choose_best_prober()

        self._log_prober_confidences()
        return self.result

    def _choose_best_prober(self) -> None:
        """
        Store the most confident high-byte prober's answer in ``result``,
        provided its confidence exceeds ``MINIMUM_THRESHOLD``.
        """
        best_prober = None
        best_confidence = 0.0
        for candidate in self._charset_probers:
            if not candidate:
                continue
            candidate_confidence = candidate.get_confidence()
            # Strict comparison: on a tie the earlier prober is kept.
            if candidate_confidence > best_confidence:
                best_confidence = candidate_confidence
                best_prober = candidate

        if not (best_prober and best_confidence > self.MINIMUM_THRESHOLD):
            return

        charset_name = best_prober.charset_name
        assert charset_name is not None
        confidence = best_prober.get_confidence()
        self.result = {
            "encoding": self._final_charset_name(charset_name),
            "confidence": confidence,
            "language": best_prober.language,
        }

    def _final_charset_name(self, charset_name: str) -> str:
        """Apply the Windows and (optional) legacy renames to a prober name."""
        lowered = charset_name.lower()
        # Use Windows encoding name instead of ISO-8859 if we saw any
        # extra Windows-specific bytes
        if self._has_win_bytes and lowered.startswith("iso-8859"):
            charset_name = self.ISO_WIN_MAP.get(lowered, charset_name)
        # Rename legacy encodings with superset encodings if asked.
        # NOTE(review): this is the only place LEGACY_MAP is consulted, so
        # results set in ``feed`` and the plain "ascii" default are never
        # renamed -- confirm that is intended.
        if self.should_rename_legacy:
            charset_name = self.LEGACY_MAP.get(charset_name.lower(), charset_name)
        return charset_name

    def _log_prober_confidences(self) -> None:
        """At DEBUG level, log every prober's confidence if nothing won."""
        debug_enabled = self.logger.getEffectiveLevel() <= logging.DEBUG
        if not debug_enabled or self.result["encoding"] is not None:
            return

        self.logger.debug("no probers hit minimum threshold")
        for candidate in self._charset_probers:
            if not candidate:
                continue
            # Group probers are expanded so each member is reported.
            if isinstance(candidate, CharSetGroupProber):
                members = candidate.probers
            else:
                members = [candidate]
            for member in members:
                self.logger.debug(
                    "%s %s confidence = %s",
                    member.charset_name,
                    member.language,
                    member.get_confidence(),
                )