Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/models.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

186 statements  

1from __future__ import annotations 

2 

3from encodings.aliases import aliases 

4from json import dumps 

5from re import sub 

6from typing import Any, Iterator, List, Tuple 

7 

8from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE 

9from .utils import iana_name, is_multi_byte_encoding, unicode_range 

10 

11 

class CharsetMatch:
    """
    One candidate decoding of a byte payload under a guessed encoding.

    Stores the raw payload, the guessed encoding, its mess ("chaos") ratio, the
    detected languages with their coherence ratios and any sub-matches attached
    through add_submatch(). The decoded text is built lazily by str().

    Note: __eq__ is defined without __hash__, so instances are not hashable.
    """

    def __init__(
        self,
        payload: bytes | bytearray,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes | bytearray = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property.
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last produced bytes and the encoding they use.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text; None until str() is first called (lazy loading).
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Equal to another CharsetMatch with the same encoding and fingerprint, or
        to a str naming the same encoding (normalised through iana_name()).
        """
        if isinstance(other, CharsetMatch):
            return (
                self.encoding == other.encoding
                and self.fingerprint == other.fingerprint
            )
        if isinstance(other, str):
            return iana_name(other) == self.encoding
        return False

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Raises ValueError when 'other' is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 0.5% difference --> Use Coherence
        if chaos_difference < 0.005 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.005 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1 - (decoded length / raw length).

        Returns 0.0 for an empty payload instead of raising ZeroDivisionError.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """Return the decoded payload, decoding it on first access."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
            # UTF-7 BOM is encoded in modified Base64 whose byte boundary
            # can overlap with the next character, so raw-byte stripping
            # is unreliable. Strip the decoded BOM character instead.
            if (
                self._has_sig_or_bom
                and self._encoding == "utf_7"
                and self._string
                and self._string[0] == "\ufeff"
            ):
                self._string = self._string[1:]
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Attach 'other' as a sub-match of this match.

        Raises ValueError when 'other' is not a CharsetMatch or compares equal
        to this instance.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """The guessed encoding name."""
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Value of the has_sig_or_bom flag given at construction."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as the 'bom' property."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """The mean mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """'chaos' expressed as a percentage, rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """'coherence' expressed as a percentage, rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes | bytearray:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        """Matches attached through add_submatch()."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """True when at least one sub-match has been attached."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """
        Sorted, de-duplicated unicode range names of the decoded characters.
        The result is computed once and cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort (sorted() accepts the set directly)
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be encoded are replaced by the encoder (errors="replace"), not dropped.

        The result is cached per target encoding. The cache is only updated once
        encoding succeeded, so a failing call (e.g. unknown codec) raises again
        on the next call instead of returning a stale payload.
        """
        if self._output_payload is None or self._output_encoding != encoding:
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the first encoding declaration found in the first 8192
                # characters so that it names the target encoding.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.group(0).replace(
                        m.group(1),
                        iana_name(encoding).replace("_", "-"),  # type: ignore[union-attr]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            encoded_payload = decoded_string.encode(encoding, "replace")

            # Commit both cache fields together, only after success.
            self._output_payload = encoded_payload
            self._output_encoding = encoding

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> int:
        """
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        """
        return hash(str(self))

252 

253 

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        # Keep the matches sorted; ordering comes from the items' own comparison.
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon an out-of-range position.
        Raise KeyError (carrying the looked-up name) when the encoding is not present in results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
        for result in self._results:
            if item in result.could_be_from_charset:
                return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        Raises ValueError when 'item' is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same fingerprint and same chaos: attach to the existing
                # match as a sub-match rather than adding a new entry.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match, or None when there is no match at all.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

319 

320 

# A (str, float) pair; CharsetMatch reads index 0 as the language name and
# index 1 as its coherence ratio.
CoherenceMatch = Tuple[str, float]

# A list of CoherenceMatch pairs.
CoherenceMatches = List[CoherenceMatch]

323 

324 

class CliDetectionResult:
    """
    Plain record of one detection result, serialisable to JSON via to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        # File identification.
        self.path: str = path
        self.unicode_path: str | None = unicode_path

        # Encoding information.
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings

        # Content information.
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom

        # Scores and selection flag.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        """
        Mapping of every field to its current value.
        The key order below is the order used by to_json().
        """
        field_order = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in field_order}

    def to_json(self) -> str:
        """Serialise the field mapping as ASCII-only JSON indented by 4 spaces."""
        as_mapping = self.__dict__
        return dumps(as_mapping, ensure_ascii=True, indent=4)