######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
28""" 

29Module containing the UniversalDetector detector class, which is the primary 

30class a user of ``chardet`` should use. 

31 

32:author: Mark Pilgrim (initial port to Python) 

33:author: Shy Shalom (original C code) 

34:author: Dan Blanchard (major refactoring for 3.0) 

35:author: Ian Cordasco 

36""" 

37 

import codecs
import logging
import re
from typing import List, Optional, Union

from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .macromanprober import MacRomanProber
from .mbcsgroupprober import MBCSGroupProber
from .resultdict import ResultDict
from .sbcsgroupprober import SBCSGroupProber
from .utf1632prober import UTF1632Prober

class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xff]")
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }
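    # For example, with should_rename_legacy=True a document detected as
    # "euc-kr" is reported under its superset name "CP949" (see the renaming
    # step in close()); the pairing comes from the map above.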

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
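        """
        :param lang_filter: Selects which probers may run; for example, the
            single-byte probers are only used when ``LanguageFilter.NON_CJK``
            is included (the default ``LanguageFilter.ALL`` enables
            everything).
        :param should_rename_legacy: If ``True``, legacy encoding names are
            replaced with their modern supersets from ``LEGACY_MAP`` when
            reporting results.
        """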

        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states. This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
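        For example, reusing one detector across two documents
        (``first_bytes`` and ``second_bytes`` are placeholder names):

        .. code::

                detector.feed(first_bytes)
                detector.close()
                detector.reset()
                detector.feed(second_bytes)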

139 """ 

140 self.result = {"encoding": None, "confidence": 0.0, "language": None} 

141 self.done = False 

142 self._got_data = False 

143 self._has_win_bytes = False 

144 self._input_state = InputState.PURE_ASCII 

145 self._last_char = b"" 

146 if self._esc_charset_prober: 

147 self._esc_charset_prober.reset() 

148 if self._utf1632_prober: 

149 self._utf1632_prober.reset() 

150 for prober in self._charset_probers: 

151 prober.reset() 

152 

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).
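        For example, a typical incremental loop (``chunks`` is a placeholder
        for any iterable of ``bytes``):

        .. code::

                detector = UniversalDetector()
                for chunk in chunks:
                    detector.feed(chunk)
                    if detector.done:
                        break
                detector.close()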

        .. note::
            You should always call ``close`` when you're done feeding in your
            document if ``done`` is not already ``True``.

166 """ 

167 if self.done: 

168 return 

169 

170 if not byte_str: 

171 return 

172 

173 if not isinstance(byte_str, bytearray): 

174 byte_str = bytearray(byte_str) 

175 

176 # First check for known BOMs, since these are guaranteed to be correct 

177 if not self._got_data: 

178 # If the data starts with BOM, we know it is UTF 

179 if byte_str.startswith(codecs.BOM_UTF8): 

180 # EF BB BF UTF-8 with BOM 

181 self.result = { 

182 "encoding": "UTF-8-SIG", 

183 "confidence": 1.0, 

184 "language": "", 

185 } 

186 elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)): 

187 # FF FE 00 00 UTF-32, little-endian BOM 

188 # 00 00 FE FF UTF-32, big-endian BOM 

189 self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""} 

190 elif byte_str.startswith(b"\xfe\xff\x00\x00"): 

191 # FE FF 00 00 UCS-4, unusual octet order BOM (3412) 

192 self.result = { 

193 # TODO: This encoding is not supported by Python. Should remove? 

194 "encoding": "X-ISO-10646-UCS-4-3412", 

195 "confidence": 1.0, 

196 "language": "", 

197 } 

198 elif byte_str.startswith(b"\x00\x00\xff\xfe"): 

199 # 00 00 FF FE UCS-4, unusual octet order BOM (2143) 

200 self.result = { 

201 # TODO: This encoding is not supported by Python. Should remove? 

202 "encoding": "X-ISO-10646-UCS-4-2143", 

203 "confidence": 1.0, 

204 "language": "", 

205 } 

206 elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)): 

207 # FF FE UTF-16, little endian BOM 

208 # FE FF UTF-16, big endian BOM 

209 self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""} 

210 

211 self._got_data = True 

212 if self.result["encoding"] is not None: 

213 self.done = True 

214 return 

215 

216 # If none of those matched and we've only see ASCII so far, check 

217 # for high bytes and escape sequences 

218 if self._input_state == InputState.PURE_ASCII: 

219 if self.HIGH_BYTE_DETECTOR.search(byte_str): 

220 self._input_state = InputState.HIGH_BYTE 

221 elif ( 

222 self._input_state == InputState.PURE_ASCII 

223 and self.ESC_DETECTOR.search(self._last_char + byte_str) 

224 ): 

225 self._input_state = InputState.ESC_ASCII 

226 

227 self._last_char = byte_str[-1:] 

228 

        # Next we will look to see if the data appears to be either a UTF-16
        # or UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left. The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use the single-byte probers
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.
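        For example, a call might produce (the exact values here are
        illustrative only):

        .. code::

                detector.close()
                # {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}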

        :returns: The ``result`` attribute, a ``dict`` with the keys
            ``encoding``, ``confidence``, and ``language``.

291 """ 

292 # Don't bother with checks if we're already done 

293 if self.done: 

294 return self.result 

295 self.done = True 

296 

297 if not self._got_data: 

298 self.logger.debug("no data received!") 

299 

300 # Default to ASCII if it is all we've seen so far 

301 elif self._input_state == InputState.PURE_ASCII: 

302 self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""} 

303 

304 # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD 

305 elif self._input_state == InputState.HIGH_BYTE: 

306 prober_confidence = None 

307 max_prober_confidence = 0.0 

308 max_prober = None 

309 for prober in self._charset_probers: 

310 if not prober: 

311 continue 

312 prober_confidence = prober.get_confidence() 

313 if prober_confidence > max_prober_confidence: 

314 max_prober_confidence = prober_confidence 

315 max_prober = prober 

316 if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): 

317 charset_name = max_prober.charset_name 

318 assert charset_name is not None 

319 lower_charset_name = charset_name.lower() 

320 confidence = max_prober.get_confidence() 

321 # Use Windows encoding name instead of ISO-8859 if we saw any 

322 # extra Windows-specific bytes 

323 if lower_charset_name.startswith("iso-8859"): 

324 if self._has_win_bytes: 

325 charset_name = self.ISO_WIN_MAP.get( 

326 lower_charset_name, charset_name 

327 ) 

328 # Rename legacy encodings with superset encodings if asked 

329 if self.should_rename_legacy: 

330 charset_name = self.LEGACY_MAP.get( 

331 (charset_name or "").lower(), charset_name 

332 ) 

333 self.result = { 

334 "encoding": charset_name, 

335 "confidence": confidence, 

336 "language": max_prober.language, 

337 } 

338 

339 # Log all prober confidences if none met MINIMUM_THRESHOLD 

340 if self.logger.getEffectiveLevel() <= logging.DEBUG: 

341 if self.result["encoding"] is None: 

342 self.logger.debug("no probers hit minimum threshold") 

343 for group_prober in self._charset_probers: 

344 if not group_prober: 

345 continue 

346 if isinstance(group_prober, CharSetGroupProber): 

347 for prober in group_prober.probers: 

348 self.logger.debug( 

349 "%s %s confidence = %s", 

350 prober.charset_name, 

351 prober.language, 

352 prober.get_confidence(), 

353 ) 

354 else: 

355 self.logger.debug( 

356 "%s %s confidence = %s", 

357 group_prober.charset_name, 

358 group_prober.language, 

359 group_prober.get_confidence(), 

360 ) 

361 return self.result
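

if __name__ == "__main__":
    # Illustrative usage sketch: detect the encoding of a file named on the
    # command line by feeding it to the detector in chunks and stopping
    # early once the detector is confident.
    import sys

    detector = UniversalDetector()
    with open(sys.argv[1], "rb") as handle:
        for chunk in iter(lambda: handle.read(4096), b""):
            detector.feed(chunk)
            if detector.done:
                break
    print(detector.close())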