Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/universaldetector.py: 88%

######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
"""
Module containing the UniversalDetector detector class, which is the primary
class a user of ``chardet`` should use.

:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
"""


import codecs
import logging
import re
from typing import List, Optional, Union

from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .macromanprober import MacRomanProber
from .mbcsgroupprober import MBCSGroupProber
from .resultdict import ResultDict
from .sbcsgroupprober import SBCSGroupProber
from .utf1632prober import UTF1632Prober


class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

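    # A prober's confidence must exceed this in close() for its result to be used.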
    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
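    # ESC (0x1B, octal \033) begins ISO-2022 escape sequences; "~{" begins HZ.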
    ESC_DETECTOR = re.compile(b"(\033|~{)")
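    # Bytes 0x80-0x9F are C1 control codes in the ISO-8859 encodings but
    # printable characters in the Windows code pages, so seeing them suggests
    # a Windows encoding.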
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states. This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
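
        For example, one detector can be reused across several documents
        (``paths`` here is a placeholder for an iterable of file names):

        .. code::

                detector = UniversalDetector()
                for path in paths:
                    detector.reset()
                    with open(path, "rb") as f:
                        for line in f:
                            detector.feed(line)
                    detector.close()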
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
            You should always call ``close`` when you're done feeding in your
            document if ``done`` is not already ``True``.
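
        For example, to feed a document in chunks and stop as soon as a
        prediction is made (``chunks`` is a placeholder for any iterable
        of ``bytes``):

        .. code::

                u = UniversalDetector()
                for chunk in chunks:
                    u.feed(chunk)
                    if u.done:
                        break
                u.close()
                detected = u.result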
        """
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {
                    "encoding": "UTF-8-SIG",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
            elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-3412",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-2143",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}

            self._got_data = True
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif (
                self._input_state == InputState.PURE_ASCII
                and self.ESC_DETECTOR.search(self._last_char + byte_str)
            ):
                self._input_state = InputState.ESC_ASCII

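        # Remember the final byte so an escape sequence split across two
        # chunks can still be matched by ESC_DETECTOR on the next call.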
        self._last_char = byte_str[-1:]

        # next we will look to see if it appears to be either a UTF-16 or
        # UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left. The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )

        return self.result