Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/models.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

184 statements  

1from __future__ import annotations 

2 

3from encodings.aliases import aliases 

4from json import dumps 

5from re import sub 

6from typing import Any, Iterator, List, Tuple 

7 

8from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE 

9from .utils import iana_name, is_multi_byte_encoding, unicode_range 

10 

11 

class CharsetMatch:
    """
    One candidate decoding of a byte payload under a guessed encoding.

    Stores the raw bytes, the guessed encoding name, the mean mess ("chaos") ratio,
    the detected languages with their coherence ratio, and the sub-matches that were
    attached through add_submatch().
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Name of the encoding used to decode the payload.
        :param mean_mess_ratio: Value exposed by the 'chaos' property.
        :param has_sig_or_bom: Value exposed by the 'bom' / 'byte_order_mark' properties.
        :param languages: List of (language, ratio) tuples; the first entry drives 'language' and 'coherence'.
        :param decoded_payload: Already decoded text, if available. When None, decoding is done lazily by str().
        :param preemptive_declaration: Encoding declaration, if any; only consulted by output() to decide
            whether the beginning of the text should be patched.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache filled on first access of the 'alphabets' property.
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last successfully produced payload and its target encoding.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both their encoding and fingerprint are equal.
        A str is compared, once passed through iana_name(), against the encoding. Anything else is unequal.
        """
        if isinstance(other, CharsetMatch):
            return (
                self.encoding == other.encoding
                and self.fingerprint == other.fingerprint
            )
        if isinstance(other, str):
            return iana_name(other) == self.encoding
        return False

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: When 'other' is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! (multi_byte_usage needs the decoded string)
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio between the decoded text length and the raw payload length.
        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Return the decoded payload. Decoding is strict and the result is kept for later calls.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Attach another match as a leaf of this one.

        :raises ValueError: When 'other' is not a CharsetMatch or compares equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                f"Unable to add instance <{other.__class__}> as a submatch of a CharsetMatch"
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for alias, canonical in aliases.items():
            if self.encoding == alias:
                also_known_as.append(canonical)
            elif self.encoding == canonical:
                also_known_as.append(alias)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as the 'bom' property."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [entry[0] for entry in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if self._languages:
            return self._languages[0][0]

        # Trying to infer the language based on the given encoding
        # Its either English or we should not pronounce ourselves in certain cases.
        if "ascii" in self.could_be_from_charset:
            return "English"

        # doing it there to avoid circular import
        from charset_normalizer.cd import encoding_languages, mb_encoding_languages

        languages = (
            mb_encoding_languages(self.encoding)
            if is_multi_byte_encoding(self.encoding)
            else encoding_languages(self.encoding)
        )

        if not languages or "Latin Based" in languages:
            return "Unknown"

        return languages[0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio attached to the first detected language, or 0.0 when no language was detected."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """
        Sorted, de-duplicated list of the truthy values returned by unicode_range() for every
        character of the decoded payload. Computed once, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the encoder ("replace" error handler).

        When a preemptive declaration other than UTF-8 was given, the first match of
        RE_POSSIBLE_ENCODING_INDICATION within the first 8192 characters is rewritten to name the target encoding.

        The result is cached per target encoding. The cache is only updated once encoding succeeded, so a failing
        call (e.g. unknown codec) never leaves a stale payload associated with the wrong encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(encoding).replace("_", "-"),
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            # Encode first, commit the cache afterwards: both fields must change together.
            output_payload = decoded_string.encode(encoding, "replace")
            self._output_payload = output_payload
            self._output_encoding = encoding

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> int:
        """
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        """
        return hash(str(self))

242 

243 

class CharsetMatches:
    """
    Container holding CharsetMatch items, kept sorted (most probable first, per CharsetMatch ordering).
    Acts like a list (iterable, sized, indexable) but does not implement every list method.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        """
        :param results: Optional initial matches; they are copied into a sorted list.
        """
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: When 'item' is an int outside of the stored results.
        :raises KeyError: When 'item' is an encoding name not present in results, or is neither an int nor a str.
            The requested key is attached to the exception.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            wanted = iana_name(item, False)
            for result in self._results:
                if wanted in result.could_be_from_charset:
                    return result
        # Carry the key as it was requested so the failure is diagnosable.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: When 'item' is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                f"Cannot append instance '{item.__class__}' to CharsetMatches"
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same decoded text and same chaos: register as a submatch of the existing entry.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

309 

310 

# A single coherence result: a str paired with a float. CharsetMatch reads index 0 as the
# language name and index 1 as its coherence ratio.
CoherenceMatch = Tuple[str, float]

# A list of coherence results; CharsetMatch treats the first entry as the most probable one.
CoherenceMatches = List[CoherenceMatch]

313 

314 

class CliDetectionResult:
    """
    Plain record of one detection result, serializable to JSON through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        """Store every argument, unchanged, under the attribute of the same name."""
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        """
        Mapping of the exported fields, in the fixed order used for the JSON output.
        """
        # Order matters: it is the key order of the serialized document.
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """
        Serialize the exported fields as ASCII-only JSON indented by 4 spaces.
        """
        serializable = self.__dict__
        return dumps(serializable, indent=4, ensure_ascii=True)