Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/models.py: 36%

185 statements  

from __future__ import annotations

from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Iterator, List, Tuple

from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


class CharsetMatch:
    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented so that sorted() can be used on lists of CharsetMatch items.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% chaos difference --> use coherence instead
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When the decision is difficult, prefer the result that decoded as much
            # multi-byte content as possible, unless the payload is large (preserve RAM).
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

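A note on the ordering above: comparison is driven primarily by the chaos (mess) ratio and falls back to coherence and multi-byte usage only when the chaos values are within 1% of each other. A minimal sketch of how this plays out through sorted(); the matches below are hand-built and every value is invented purely for illustration:

    # Illustration only -- these matches are not produced by actual detection.
    from charset_normalizer.models import CharsetMatch

    clean = CharsetMatch(b"hello", "utf_8", 0.0, False, [("English", 0.5)])
    messy = CharsetMatch(b"hello", "cp1252", 0.2, False, [])

    # Chaos differs by 0.2 (well above 1%), so the lower-chaos match sorts first.
    assert sorted([messy, clean])[0] is clean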

    @property
    def multi_byte_usage(self) -> float:
        return 1.0 - (len(str(self)) / len(self.raw))

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encodings are known by many names. This can help when, for example, searching for IBM855 while it is listed as CP855.
        """
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

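The alias table comes straight from the standard library's encodings.aliases module, so the exact list returned depends on the running interpreter. A small hypothetical example (payload and scores are invented):

    from charset_normalizer.models import CharsetMatch

    match = CharsetMatch(b"abc", "cp855", 0.0, False, [])
    # Typically contains names such as 'ibm855'; contents vary by Python version.
    print(match.encoding_aliases)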

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in the decoded sequence.
        Usually not very useful. The returned list may be empty even if the 'language' property returns something other than 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in the decoded sequence. If none was detected or inferred, the property
        returns "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding.
            # It is either English or we should not commit ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it here to avoid a circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encodings that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in the 'encoding' property.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
        Characters that cannot be represented in the target encoding are replaced by the encoder, not raised as errors.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()


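Taken together, output() and fingerprint mean that two matches hash identically whenever they re-encode to the same UTF-8 bytes. A short hypothetical sketch (values invented for illustration, assuming the package is importable):

    from charset_normalizer.models import CharsetMatch

    m = CharsetMatch("héllo".encode("latin_1"), "latin_1", 0.0, False, [])
    print(m.output("utf_8"))  # b'h\xc3\xa9llo' -- decoded with latin_1, re-encoded as UTF-8
    print(m.output("ascii"))  # b'h?llo' -- unrepresentable characters are replaced
    print(m.fingerprint)      # SHA256 hex digest of m.output() (UTF-8 by default)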

class CharsetMatches:
    """
    Container of every CharsetMatch item, ordered by default from the most probable to the least probable one.
    Acts like a list (iterable) but does not implement all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or by its encoding name (an alias may be used here).
        Raise KeyError upon an invalid index or an encoding not present in the results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. It will be inserted so as to preserve the sort order.
        It may be attached as a submatch of an existing result.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match. Strictly equivalent to matches[0].
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method that simply calls best(). Kept for backward-compatibility reasons.
        """
        return self.best()


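As the methods above show, the container keeps its results sorted as matches are appended and supports lookup both by position and by encoding name. A hypothetical usage sketch (all values invented for illustration):

    from charset_normalizer.models import CharsetMatch, CharsetMatches

    results = CharsetMatches()
    results.append(CharsetMatch(b"abc", "ascii", 0.0, False, []))
    results.append(CharsetMatch("été".encode("cp1252"), "cp1252", 0.1, False, []))

    best_guess = results.best()  # the first (best-ranked) match; None only when the container is empty
    by_name = results["cp1252"]  # name lookup is normalized through iana_name()
    assert best_guess is results[0]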

CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]


class CliDetectionResult:
    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        return dumps(self.__dict__, ensure_ascii=True, indent=4)
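CliDetectionResult is a plain record whose overridden __dict__ property feeds to_json(). A hypothetical sketch of building and serializing one (every value below is invented for illustration):

    from charset_normalizer.models import CliDetectionResult

    report = CliDetectionResult(
        path="./sample.txt",
        encoding="utf_8",
        encoding_aliases=["utf8"],
        alternative_encodings=[],
        language="English",
        alphabets=["Basic Latin"],
        has_sig_or_bom=False,
        chaos=0.0,
        coherence=100.0,
        unicode_path=None,
        is_preferred=True,
    )
    print(report.to_json())  # pretty-printed JSON built from the __dict__ property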