1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26# 02110-1301 USA
27######################### END LICENSE BLOCK #########################
28"""
29Module containing the UniversalDetector detector class, which is the primary
30class a user of ``chardet`` should use.
31
32:author: Mark Pilgrim (initial port to Python)
33:author: Shy Shalom (original C code)
34:author: Dan Blanchard (major refactoring for 3.0)
35:author: Ian Cordasco
36"""
37
38
39import codecs
40import logging
41import re
42from typing import List, Optional, Union
43
44from .charsetgroupprober import CharSetGroupProber
45from .charsetprober import CharSetProber
46from .enums import InputState, LanguageFilter, ProbingState
47from .escprober import EscCharSetProber
48from .latin1prober import Latin1Prober
49from .macromanprober import MacRomanProber
50from .mbcsgroupprober import MBCSGroupProber
51from .resultdict import ResultDict
52from .sbcsgroupprober import SBCSGroupProber
53from .utf1632prober import UTF1632Prober
54
55
class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # Probers scoring at or below this confidence are ignored in close().
    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
    # Matches an ESC byte (ISO-2022 family) or the "~{" HZ-GB-2312 opener.
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    # Bytes in 0x80-0x9F are C1 controls in ISO-8859 but printable characters
    # in the corresponding Windows code pages; seeing them suggests Windows.
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {
                    "encoding": "UTF-8-SIG",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
            elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-3412",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-2143",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}

            self._got_data = True
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            # Prepend the previous chunk's last byte so an escape sequence
            # split across two feed() calls is still detected.
            elif self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # next we will look to see if it appears to be either a UTF-16 or
        # UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                # Reuse the confidence computed in the loop above instead of
                # recomputing it with another get_confidence() call.
                confidence = max_prober_confidence
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )
        return self.result