######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################
"""
Module containing the UniversalDetector detector class, which is the primary
class a user of ``chardet`` should use.

:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
"""

import codecs
import logging
import re
from typing import List, Optional, Union

from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .macromanprober import MacRomanProber
from .mbcsgroupprober import MBCSGroupProber
from .resultdict import ResultDict
from .sbcsgroupprober import SBCSGroupProber
from .utf1632prober import UTF1632Prober


class UniversalDetector:
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xff]")
    ESC_DETECTOR = re.compile(b"(\033|~{)")
    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")
    ISO_WIN_MAP = {
        "iso-8859-1": "Windows-1252",
        "iso-8859-2": "Windows-1250",
        "iso-8859-5": "Windows-1251",
        "iso-8859-6": "Windows-1256",
        "iso-8859-7": "Windows-1253",
        "iso-8859-8": "Windows-1255",
        "iso-8859-9": "Windows-1254",
        "iso-8859-13": "Windows-1257",
    }
    # Based on https://encoding.spec.whatwg.org/#names-and-labels
    # but altered to match Python names for encodings and remove mappings
    # that break tests.
    LEGACY_MAP = {
        "ascii": "Windows-1252",
        "iso-8859-1": "Windows-1252",
        "tis-620": "ISO-8859-11",
        "iso-8859-9": "Windows-1254",
        "gb2312": "GB18030",
        "euc-kr": "CP949",
        "utf-16le": "UTF-16",
    }
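
    # Illustrative effect of LEGACY_MAP (a sketch, not an assertion about any
    # particular input): when constructed with ``should_rename_legacy=True``,
    # a detection of "ascii" is reported as its common superset:
    #
    #     UniversalDetector.LEGACY_MAP["ascii"]    # -> "Windows-1252"
    #     UniversalDetector.LEGACY_MAP["gb2312"]   # -> "GB18030"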

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
    ) -> None:
        self._esc_charset_prober: Optional[EscCharSetProber] = None
        self._utf1632_prober: Optional[UTF1632Prober] = None
        self._charset_probers: List[CharSetProber] = []
        self.result: ResultDict = {
            "encoding": None,
            "confidence": 0.0,
            "language": None,
        }
        self.done = False
        self._got_data = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = False
        self.should_rename_legacy = should_rename_legacy
        self.reset()

    @property
    def input_state(self) -> int:
        return self._input_state

    @property
    def has_win_bytes(self) -> bool:
        return self._has_win_bytes

    @property
    def charset_probers(self) -> List[CharSetProber]:
        return self._charset_probers

    def reset(self) -> None:
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {"encoding": None, "confidence": 0.0, "language": None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b""
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        if self._utf1632_prober:
            self._utf1632_prober.reset()
        for prober in self._charset_probers:
            prober.reset()
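
    # A sketch of reusing one detector across several documents via reset(),
    # as the docstring above suggests (the paths are illustrative):
    #
    #     detector = UniversalDetector()
    #     for path in ("first.txt", "second.txt"):
    #         detector.reset()                 # clear state from the last run
    #         with open(path, "rb") as handle:
    #             detector.feed(handle.read())
    #         print(path, detector.close())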

    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {
                    "encoding": "UTF-8-SIG",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
            elif byte_str.startswith(b"\xfe\xff\x00\x00"):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-3412",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith(b"\x00\x00\xff\xfe"):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    # TODO: This encoding is not supported by Python. Should remove?
                    "encoding": "X-ISO-10646-UCS-4-2143",
                    "confidence": 1.0,
                    "language": "",
                }
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}

            self._got_data = True
            if self.result["encoding"] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif (
                self._input_state == InputState.PURE_ASCII
                and self.ESC_DETECTOR.search(self._last_char + byte_str)
            ):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # next we will look to see if it appears to be either a UTF-16 or
        # UTF-32 encoding
        if not self._utf1632_prober:
            self._utf1632_prober = UTF1632Prober()

        if self._utf1632_prober.state == ProbingState.DETECTING:
            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._utf1632_prober.charset_name,
                    "confidence": self._utf1632_prober.get_confidence(),
                    "language": "",
                }
                self.done = True
                return

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {
                    "encoding": self._esc_charset_prober.charset_name,
                    "confidence": self._esc_charset_prober.get_confidence(),
                    "language": self._esc_charset_prober.language,
                }
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
                self._charset_probers.append(MacRomanProber())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {
                        "encoding": prober.charset_name,
                        "confidence": prober.get_confidence(),
                        "language": prober.language,
                    }
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True
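
    # A sketch of the early-exit behavior feed() implements: a recognized BOM
    # settles the result on the very first chunk (the bytes below are the
    # UTF-8 BOM followed by ASCII text):
    #
    #     detector = UniversalDetector()
    #     detector.feed(b"\xef\xbb\xbfhello")
    #     detector.done                   # True -- no more data needed
    #     detector.close()["encoding"]    # "UTF-8-SIG"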

    def close(self) -> ResultDict:
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns: The ``result`` attribute, a ``dict`` with the keys
                  `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug("no data received!")

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                assert charset_name is not None
                lower_charset_name = charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859"):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name
                        )
                # Rename legacy encodings with superset encodings if asked
                if self.should_rename_legacy:
                    charset_name = self.LEGACY_MAP.get(
                        (charset_name or "").lower(), charset_name
                    )
                self.result = {
                    "encoding": charset_name,
                    "confidence": confidence,
                    "language": max_prober.language,
                }

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result["encoding"] is None:
                self.logger.debug("no probers hit minimum threshold")
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug(
                                "%s %s confidence = %s",
                                prober.charset_name,
                                prober.language,
                                prober.get_confidence(),
                            )
                    else:
                        self.logger.debug(
                            "%s %s confidence = %s",
                            group_prober.charset_name,
                            group_prober.language,
                            group_prober.get_confidence(),
                        )
        return self.result
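

# A minimal, hedged demo of the detector on a few in-memory byte strings.
# This block is not part of the upstream module; because of the relative
# imports above, run it with ``python -m chardet.universaldetector`` rather
# than executing the file directly.
if __name__ == "__main__":
    _samples = [
        b"\xef\xbb\xbfplain text behind a UTF-8 BOM",
        b"pure ASCII, nothing exotic",
        "caf\u00e9 au lait".encode("utf-8"),
    ]
    _detector = UniversalDetector()
    for _sample in _samples:
        _detector.reset()  # fresh state for each document
        _detector.feed(_sample)
        print(_sample[:20], "->", _detector.close())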