1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26# 02110-1301 USA
27######################### END LICENSE BLOCK #########################
28"""
29Module containing the UniversalDetector detector class, which is the primary
30class a user of ``chardet`` should use.
31
32:author: Mark Pilgrim (initial port to Python)
33:author: Shy Shalom (original C code)
34:author: Dan Blanchard (major refactoring for 3.0)
35:author: Ian Cordasco
36"""
37
38import codecs
39import logging
40import re
41from typing import List, Optional, Union
42
43from .charsetgroupprober import CharSetGroupProber
44from .charsetprober import CharSetProber
45from .enums import InputState, LanguageFilter, ProbingState
46from .escprober import EscCharSetProber
47from .latin1prober import Latin1Prober
48from .macromanprober import MacRomanProber
49from .mbcsgroupprober import MBCSGroupProber
50from .resultdict import ResultDict
51from .sbcsgroupprober import SBCSGroupProber
52from .utf1632prober import UTF1632Prober
53
54
class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # Prober results below this confidence are discarded by ``close``.
    MINIMUM_THRESHOLD = 0.20
    # Any byte >= 0x80 means the input cannot be pure ASCII.
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xff]")
    # ESC (used by ISO-2022-*) or "~{" (used by HZ-GB-2312) signals an
    # escape-sequence-based encoding.
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    # Bytes in the 0x80-0x9f range, which Windows codepages repurpose for
    # printable characters; seeing one hints at a Windows-* encoding rather
    # than the corresponding ISO-8859 variant (see ``ISO_WIN_MAP``).
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")
    # Maps each ISO-8859 result to its Windows superset codepage; applied in
    # ``close`` when ``WIN_BYTE_DETECTOR`` matched during ``feed``.
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        """
        :param lang_filter: Bit flags restricting which language-specific
            probers ``feed`` will create (see ``LanguageFilter``).
        :param should_rename_legacy: If ``True``, ``close`` renames legacy
            encoding results to superset names using ``LEGACY_MAP``.
        """
        # Probers are created lazily in ``feed``, only once the input data
        # shows they are needed.
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        # Final/interim detection result; ``encoding`` stays None until a
        # prediction is made.
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        # Last byte of the previous chunk, kept so escape sequences split
        # across ``feed`` calls are still detected.
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        """Current ``InputState`` (PURE_ASCII, ESC_ASCII, or HIGH_BYTE)."""
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        """Whether any bytes in the Windows-specific 0x80-0x9f range were seen."""
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        """Probers created for high-byte input (empty until needed)."""
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states. This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        # Probers that were never created stay None; existing ones are reused.
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        # A prediction was already made (or ``close`` was called); ignore input.
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {
                    "encoding": "UTF-8-SIG",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
            elif byte_str.startswith(b"\xfe\xff\x00\x00"):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-3412",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith(b"\x00\x00\xff\xfe"):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-2143",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}

            self._got_data = True
            # A BOM match is conclusive; skip all probers.
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif (
                self._input_state == InputState.PURE_ASCII
                and self.ESC_DETECTOR.search(self._last_char + byte_str)
            ):
                # Prepending ``_last_char`` catches the "~{" marker when it
                # straddles a chunk boundary.
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # next we will look to see if it appears to be either a UTF-16 or
        # UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            # Remember C1-range bytes so ``close`` can prefer the Windows
            # codepage name over an ISO-8859 one.
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            # Pick the single prober with the highest confidence.
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        # Group probers hold sub-probers; log each one.
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )
        return self.result