1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26# 02110-1301 USA
27######################### END LICENSE BLOCK #########################
28"""
29Module containing the UniversalDetector detector class, which is the primary
30class a user of ``chardet`` should use.
31
32:author: Mark Pilgrim (initial port to Python)
33:author: Shy Shalom (original C code)
34:author: Dan Blanchard (major refactoring for 3.0)
35:author: Ian Cordasco
36"""
37
38import codecs
39import logging
40import re
41from typing import List, Optional, Union
42
43from .charsetgroupprober import CharSetGroupProber
44from .charsetprober import CharSetProber
45from .enums import InputState, LanguageFilter, ProbingState
46from .escprober import EscCharSetProber
47from .latin1prober import Latin1Prober
48from .macromanprober import MacRomanProber
49from .mbcsgroupprober import MBCSGroupProber
50from .resultdict import ResultDict
51from .sbcsgroupprober import SBCSGroupProber
52from .utf1632prober import UTF1632Prober
53
54
class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # Prober results below this confidence are discarded by ``close``.
    MINIMUM_THRESHOLD = 0.20
    # Any byte >= 0x80 means the input cannot be pure ASCII.
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xff]")
    # ESC (used by ISO-2022-*) or "~{" (used by HZ-GB-2312) signals an
    # escape-sequence-based encoding.
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    # Bytes in the 0x80-0x9f range, which Windows codepages repurpose for
    # printable characters; seeing one hints at a Windows-* encoding rather
    # than the corresponding ISO-8859 variant (see ``ISO_WIN_MAP``).
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")
    # Maps each ISO-8859 result to its Windows superset codepage; applied in
    # ``close`` when ``WIN_BYTE_DETECTOR`` matched during ``feed``.
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        """
        :param lang_filter: Bit flags restricting which language-specific
            probers ``feed`` will create (see ``LanguageFilter``).
        :param should_rename_legacy: If ``True``, ``close`` renames legacy
            encoding results to superset names using ``LEGACY_MAP``.
        """
        # Probers are created lazily in ``feed``, only once the input data
        # shows they are needed.
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        # Final/interim detection result; ``encoding`` stays None until a
        # prediction is made.
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        # Last byte of the previous chunk, kept so escape sequences split
        # across ``feed`` calls are still detected.
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        """Current ``InputState`` (PURE_ASCII, ESC_ASCII, or HIGH_BYTE)."""
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        """Whether any bytes in the Windows-specific 0x80-0x9f range were seen."""
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        """Probers created for high-byte input (empty until needed)."""
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states. This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        # Probers that were never created stay None; existing ones are reused.
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        # A prediction was already made (or ``close`` was called); ignore input.
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {
                    "encoding": "UTF-8-SIG",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
            elif byte_str.startswith(b"\xfe\xff\x00\x00"):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-3412",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith(b"\x00\x00\xff\xfe"):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-2143",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}

            self._got_data = True
            # A BOM match is conclusive; skip all probers.
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif (
                self._input_state == InputState.PURE_ASCII
                and self.ESC_DETECTOR.search(self._last_char + byte_str)
            ):
                # Prepending ``_last_char`` catches the "~{" marker when it
                # straddles a chunk boundary.
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # next we will look to see if it appears to be either a UTF-16 or
        # UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            # Remember C1-range bytes so ``close`` can prefer the Windows
            # codepage name over an ISO-8859 one.
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            # Pick the single prober with the highest confidence.
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        # Group probers hold sub-probers; log each one.
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )
        return self.result