Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/models.py: 69%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

178 statements  

1from encodings.aliases import aliases 

2from hashlib import sha256 

3from json import dumps 

4from typing import Any, Dict, Iterator, List, Optional, Tuple, Union 

5 

6from .constant import TOO_BIG_SEQUENCE 

7from .utils import iana_name, is_multi_byte_encoding, unicode_range 

8 

9 

class CharsetMatch:
    """
    One candidate encoding for a byte payload, together with its detection metrics.

    The decoded text is produced lazily (see ``__str__``) and the re-encoded
    output is cached per target encoding (see ``output``).
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Codec name used to decode ``payload``.
        :param mean_mess_ratio: Value exposed through the ``chaos`` property.
        :param has_sig_or_bom: Value exposed through ``bom`` / ``byte_order_mark``.
        :param languages: Sequence of ``(language, ratio)`` pairs; the first pair
            drives ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available. When omitted
            the payload is decoded on first use.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the ``alphabets`` property.
        self._unicode_ranges: Optional[List[str]] = None

        # Matches registered through add_submatch().
        self._leaves: List[CharsetMatch] = []
        # Initialised here but not read by any method of this class.
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last produced bytes and the encoding they were produced with.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Compare with another CharsetMatch (same encoding and same fingerprint)
        or with a str (its IANA name must equal this match's encoding).
        Any other type compares unequal.

        Note: as ``__hash__`` is not defined alongside ``__eq__``, instances are unhashable.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Returns True when this match should be ordered before ``other``.
        Raises ValueError when ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! (multi_byte_usage needs the fully decoded string)
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        One minus the ratio between the decoded character count and the raw byte count.
        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """Return the decoded payload, decoding it (strict mode) on first access."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register ``other`` as a submatch of this match.

        Raises ValueError when ``other`` is not a CharsetMatch or compares equal to this match.
        The decoded string held by ``other`` is dropped; it is decoded again on demand.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Encoding name given at construction."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.

        Built by matching this encoding against both sides of ``encodings.aliases.aliases``.
        """
        also_known_as: List[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Value of ``has_sig_or_bom`` given at construction."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as ``bom``."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """Value of ``mean_mess_ratio`` given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """``chaos`` multiplied by 100 and rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """``coherence`` multiplied by 100 and rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Matches registered through ``add_submatch`` (the internal list itself, not a copy)."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """Sorted, de-duplicated unicode range names of the decoded characters (computed once, then cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges (a set removes duplicates up front)
        detected_ranges = {unicode_range(char) for char in str(self)}
        # filter out empty results and sort
        self._unicode_ranges = sorted(r for r in detected_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the encoder
        ("replace" error handler); they are not silently dropped.
        The result is cached and reused as long as the same target encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

221 

222 

class CharsetMatches:
    """
    Ordered collection of CharsetMatch items, kept sorted with the most probable match first.
    Behaves like a read-mostly list but only implements a subset of the list protocol.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        """Store a sorted copy of ``results``; a missing or empty input gives an empty container."""
        ordered: List[CharsetMatch] = []
        if results:
            ordered = sorted(results)
        self._results: List[CharsetMatch] = ordered

    def __iter__(self) -> Iterator[CharsetMatch]:
        """Iterate over the stored matches in their sorted order."""
        for match in self._results:
            yield match

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Fetch one match by position (int) or by encoding name (str, aliases accepted).

        An integer is used directly as a list index, so an out-of-range position raises
        the list's own IndexError. A name that matches no stored result, or a key that is
        neither int nor str, raises KeyError.
        """
        if isinstance(item, int):
            return self._results[item]
        if not isinstance(item, str):
            raise KeyError

        wanted = iana_name(item, False)
        for candidate in self._results:
            if wanted in candidate.could_be_from_charset:
                return candidate
        raise KeyError

    def __len__(self) -> int:
        """Number of stored top-level matches."""
        return len(self._results)

    def __bool__(self) -> bool:
        """True when at least one match is stored."""
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Add a single match while keeping the container sorted.

        When the payload is not larger than TOO_BIG_SEQUENCE and an existing match has the
        same fingerprint and the same chaos, ``item`` is attached to that match as a
        submatch instead of being stored as a new entry.
        Raises ValueError when ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )

        # Submatch factoring is skipped for heavy inputs (conserve RAM usage).
        factoring_allowed = len(item.raw) <= TOO_BIG_SEQUENCE
        if factoring_allowed:
            twin = next(
                (
                    known
                    for known in self._results
                    if known.fingerprint == item.fingerprint
                    and known.chaos == item.chaos
                ),
                None,
            )
            if twin is not None:
                twin.add_submatch(item)
                return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Return the first (most probable) match, or None when the container is empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> Optional["CharsetMatch"]:
        """
        Alias of best(), kept for backward compatibility.
        """
        return self.best()

288 

289 

# A single language guess: (language name, ratio).
CoherenceMatch = Tuple[str, float]
# A list of language guesses; CharsetMatch reads its first entry for `language` and `coherence`.
CoherenceMatches = List[CoherenceMatch]

292 

293 

class CliDetectionResult:
    """Plain record describing one detection result, serialisable to JSON through ``to_json``."""

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument, unchanged, on the attribute of the same name."""
        # Location of the input and of the optional unicode output.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path

        # Encoding related fields.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings

        # Content related fields.
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom

        # Metrics and ranking flag.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """Mapping of the exported fields, in the order used for the JSON output."""
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """Serialise the exported fields as ASCII-only JSON indented by 4 spaces."""
        payload = self.__dict__
        return dumps(payload, indent=4, ensure_ascii=True)