Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/models.py: 37%

174 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-25 06:37 +0000

1from encodings.aliases import aliases 

2from hashlib import sha256 

3from json import dumps 

4from typing import Any, Dict, Iterator, List, Optional, Tuple, Union 

5 

6from .constant import TOO_BIG_SEQUENCE 

7from .utils import iana_name, is_multi_byte_encoding, unicode_range 

8 

9 

class CharsetMatch:
    """
    One candidate decoding of a bytes payload.

    Holds the raw payload, the guessed encoding, the chaos (mess) ratio, the
    language/coherence matches and the signature/BOM flag handed in by the
    caller. The decoded text is produced lazily and cached.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Encoding name used to decode the payload.
        :param mean_mess_ratio: Chaos ratio, exposed through the 'chaos' property.
        :param has_sig_or_bom: Exposed through 'bom' / 'byte_order_mark'.
        :param languages: Sequence of (language, coherence ratio) pairs; the first entry is treated as the best one.
        :param decoded_payload: Already decoded text, if available. When None the payload is decoded on first use.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last requested target encoding and its result.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share both encoding and fingerprint.

        Raise TypeError when compared with anything that is not a CharsetMatch.
        """
        # NOTE(review): raising here instead of returning NotImplemented makes
        # expressions such as `match == None` fail. Kept as-is so that callers
        # relying on the TypeError are not broken.
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        A match that is "less than" another one is sorted first, i.e. is more probable.

        Raise ValueError when compared with anything that is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError(
                "__lt__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # When having a tough decision (exact tie on both metrics), use the result
        # that decoded as many multi-byte as possible. This check used to be nested
        # under `coherence_difference > 0.02`, where it could never be true.
        if chaos_difference == 0.0 and self.coherence == other.coherence:
            # Computing multi_byte_usage decodes both payloads; skip it for heavy
            # inputs to conserve RAM, like the submatch factoring does.
            if max(len(self._payload), len(other._payload)) <= TOO_BIG_SEQUENCE:
                return self.multi_byte_usage > other.multi_byte_usage
            return False

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio of decoded characters to raw bytes.
        Equals 0.0 when every byte decodes to exactly one character, and 0.0 for an empty payload.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            # Nothing to measure; avoid dividing by zero.
            return 0.0
        return 1.0 - len(str(self)) / raw_length

    def __str__(self) -> str:
        """
        Decoded payload. Decoding is strict and happens once; the result is cached.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a leaf (submatch) of this one.

        Raise ValueError when 'other' is not a CharsetMatch or is equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Encoding name this match was built with."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encodings are known by many names; using this could help when searching for IBM855 when it's listed as CP855.
        Built from the standard library 'encodings.aliases' table, looked up in both directions.
        """
        also_known_as: List[str] = []
        for alias, canonical in aliases.items():
            if self.encoding == alias:
                also_known_as.append(canonical)
            elif self.encoding == canonical:
                also_known_as.append(alias)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Signature/BOM flag given at construction."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as 'bom'."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [entry[0] for entry in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if self._languages:
            return self._languages[0][0]

        # Trying to infer the language based on the given encoding
        # Its either English or we should not pronounce ourselves in certain cases.
        if "ascii" in self.could_be_from_charset:
            return "English"

        # doing it there to avoid circular import
        from charset_normalizer.cd import encoding_languages, mb_encoding_languages

        languages = (
            mb_encoding_languages(self.encoding)
            if is_multi_byte_encoding(self.encoding)
            else encoding_languages(self.encoding)
        )

        if not languages or "Latin Based" in languages:
            return "Unknown"

        return languages[0]

    @property
    def chaos(self) -> float:
        """Mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language match, or 0.0 when there is no language match."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """'chaos' as a percentage rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """'coherence' as a percentage rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Matches registered through add_submatch()."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted list of the distinct unicode ranges seen in the decoded payload.
        Computed once, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter out falsy results, deduplicate and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that the target encoding cannot represent are REPLACED by the encoder
        ("replace" error handler), not dropped.
        The result for the most recently requested encoding is cached.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

220 

221 

class CharsetMatches:
    """
    Container holding CharsetMatch items, kept sorted from the most probable to the least probable one.
    Acts like a list (iterable) but does not implement all related methods.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        """
        :param results: Initial matches; they are sorted on the way in. None or an empty list gives an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon an out-of-range position.
        Raise KeyError when the encoding is not present in results, or when 'item' is neither int nor str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name before looking it up.
            wanted = iana_name(item, False)
            for result in self._results:
                if wanted in result.could_be_from_charset:
                    return result
        # Carry the key as given by the caller so the error is actionable.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch of an existing result that has the same fingerprint and chaos.

        Raise ValueError when 'item' is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if self._results and len(item.raw) <= TOO_BIG_SEQUENCE:
            # The fingerprint is a SHA256 over the re-encoded payload: compute it
            # once for the new item instead of once per stored result.
            item_chaos = item.chaos
            item_fingerprint = item.fingerprint
            for match in self._results:
                # Cheap float comparison first, hashing only when it matters.
                if match.chaos == item_chaos and match.fingerprint == item_fingerprint:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

287 

288 

# One language candidate: (language name, coherence ratio).
# CharsetMatch reads index 0 as the language and index 1 as the ratio.
CoherenceMatch = Tuple[str, float]

# A list of language candidates; CharsetMatch treats the first entry as the best one.
CoherenceMatches = List[CoherenceMatch]

291 

292 

class CliDetectionResult:
    """
    Plain record of one detection outcome, serialisable to JSON through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument, unchanged, on the attribute of the same name."""
        self.path: str = path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: Optional[str] = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of the exported fields, in the order they appear in the JSON output.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """
        Serialise the exported fields as ASCII-only JSON indented by 4 spaces.
        """
        serialisable = self.__dict__
        return dumps(serialisable, ensure_ascii=True, indent=4)