Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/universaldetector.py: 88%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is Mozilla Universal charset detector code.

4# The Initial Developer of the Original Code is

5# Netscape Communications Corporation.

9# Contributor(s):

10# Mark Pilgrim - port to Python

11# Shy Shalom - original C code

12#

13# This library is free software; you can redistribute it and/or

14# modify it under the terms of the GNU Lesser General Public

15# License as published by the Free Software Foundation; either

16# version 2.1 of the License, or (at your option) any later version.

17#

18# This library is distributed in the hope that it will be useful,

19# but WITHOUT ANY WARRANTY; without even the implied warranty of

20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

21# Lesser General Public License for more details.

22#

23# You should have received a copy of the GNU Lesser General Public

24# License along with this library; if not, see

25# <https://www.gnu.org/licenses/>.

26######################### END LICENSE BLOCK #########################

27"""

28Module containing the UniversalDetector detector class, which is the primary

29class a user of ``chardet`` should use.

31:author: Mark Pilgrim (initial port to Python)

32:author: Shy Shalom (original C code)

33:author: Dan Blanchard (major refactoring for 3.0)

34:author: Ian Cordasco

35"""

37import codecs

38import logging

39import re

40from typing import List, Optional, Union

42from .charsetgroupprober import CharSetGroupProber

43from .charsetprober import CharSetProber

44from .enums import InputState, LanguageFilter, ProbingState

45from .escprober import EscCharSetProber

46from .latin1prober import Latin1Prober

47from .macromanprober import MacRomanProber

48from .mbcsgroupprober import MBCSGroupProber

49from .resultdict import ResultDict

50from .sbcsgroupprober import SBCSGroupProber

51from .utf1632prober import UTF1632Prober

class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # A prober's confidence must exceed this value for ``close`` to report it.
    MINIMUM_THRESHOLD = 0.20
    # Any byte with the high bit set.
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xff]")
    # An ESC byte or the two-byte sequence "~{".
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    # Bytes in the 0x80-0x9F range; seeing one sets ``_has_win_bytes``.
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")
    # ISO-8859 name (lower case) -> Windows code page name reported instead
    # when bytes matching WIN_BYTE_DETECTOR were seen.
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        """
        Create a detector in its initial state.

        :param lang_filter: Passed to the escape-sequence and multi-byte
            probers when they are created; its ``NON_CJK`` bit decides
            whether the single-byte group prober is added.
        :param should_rename_legacy: When true, ``close`` maps the winning
            encoding name through ``LEGACY_MAP``.
        """
        # Probers are created lazily by ``feed``.
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        """The current ``InputState`` value tracked by ``feed``."""
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        """Whether ``feed`` has seen a byte matching ``WIN_BYTE_DETECTOR``."""
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        """The list of probers used for high-byte input (may be empty)."""
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        # Existing probers are reset in place rather than recreated.
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    @staticmethod
    def _bom_result(byte_str: bytearray) -> Optional[ResultDict]:
        """
        Return the result implied by a byte-order mark at the start of
        ``byte_str``, or ``None`` when it starts with no known BOM.

        The four-byte marks are tested before the two-byte UTF-16 marks
        because ``FF FE 00 00`` and ``FE FF 00 00`` begin with a UTF-16 BOM.
        """
        if byte_str.startswith(codecs.BOM_UTF8):
            # EF BB BF  UTF-8 with BOM
            return {"encoding": "UTF-8-SIG", "confidence": 1.0, "language": ""}
        if byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            # FF FE 00 00  UTF-32, little-endian BOM
            # 00 00 FE FF  UTF-32, big-endian BOM
            return {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
        if byte_str.startswith(b"\xfe\xff\x00\x00"):
            # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
            return {
                # TODO: This encoding is not supported by Python. Should remove?
                "encoding": "X-ISO-10646-UCS-4-3412",
                "confidence": 1.0,
                "language": "",
            }
        if byte_str.startswith(b"\x00\x00\xff\xfe"):
            # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
            return {
                # TODO: This encoding is not supported by Python. Should remove?
                "encoding": "X-ISO-10646-UCS-4-2143",
                "confidence": 1.0,
                "language": "",
            }
        if byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
            # FF FE  UTF-16, little endian BOM
            # FE FF  UTF-16, big endian BOM
            return {"encoding": "UTF-16", "confidence": 1.0, "language": ""}
        return None

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct.
        # Only the very first non-empty chunk is inspected.
        if not self._got_data:
            bom_result = self._bom_result(byte_str)
            if bom_result is not None:
                self.result = bom_result

            self._got_data = True
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences.
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            # The previous chunk's last byte is prepended so that a "~{"
            # split across two chunks is still found.
            elif self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # Next we look to see if it appears to be either a UTF-16 or
        # UTF-32 encoding.
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                # Strict comparison: on a tie the earlier prober wins.
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                # Reuse the score measured in the loop above instead of
                # asking the winning prober for it a second time.
                confidence = max_prober_confidence
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        # Group probers are expanded so each member's
                        # confidence is logged individually.
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )
        return self.result