1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26# 02110-1301 USA
27######################### END LICENSE BLOCK #########################
28"""
29Module containing the UniversalDetector detector class, which is the primary
30class a user of ``chardet`` should use.
31
32:author: Mark Pilgrim (initial port to Python)
33:author: Shy Shalom (original C code)
34:author: Dan Blanchard (major refactoring for 3.0)
35:author: Ian Cordasco
36"""
37
38
39import codecs
40import logging
41import re
42from typing import List, Optional, Union
43
44from .charsetgroupprober import CharSetGroupProber
45from .charsetprober import CharSetProber
46from .enums import InputState, LanguageFilter, ProbingState
47from .escprober import EscCharSetProber
48from .latin1prober import Latin1Prober
49from .macromanprober import MacRomanProber
50from .mbcsgroupprober import MBCSGroupProber
51from .resultdict import ResultDict
52from .sbcsgroupprober import SBCSGroupProber
53from .utf1632prober import UTF1632Prober
54
55
class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # Probers scoring at or below this confidence are ignored in close().
    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
    # Matches an ESC byte (ISO-2022 family) or the "~{" HZ-GB-2312 opener.
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    # Bytes in 0x80-0x9F are C1 controls in ISO-8859 but printable characters
    # in the corresponding Windows code pages; seeing them suggests Windows.
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {
                    "encoding": "UTF-8-SIG",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
            elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-3412",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-2143",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}

            self._got_data = True
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            # Prepend the previous chunk's last byte so an escape sequence
            # split across two feed() calls is still detected.
            elif self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # next we will look to see if it appears to be either a UTF-16 or
        # UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                # Reuse the confidence computed in the loop above instead of
                # recomputing it with another get_confidence() call.
                confidence = max_prober_confidence
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )
        return self.result