Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/charset_normalizer/models.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

184 statements  

1from encodings.aliases import aliases 

2from hashlib import sha256 

3from json import dumps 

4from re import sub 

5from typing import Any, Dict, Iterator, List, Optional, Tuple, Union 

6 

7from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE 

8from .utils import iana_name, is_multi_byte_encoding, unicode_range 

9 

10 

class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Stores the raw bytes, the guessed encoding, the mess ("chaos") ratio, the
    detected languages with their ratios, and the sub-matches registered
    through add_submatch().
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
        preemptive_declaration: Optional[str] = None,
    ):
        """
        :param payload: Original bytes, exposed untouched through 'raw'.
        :param guessed_encoding: Encoding name used to decode the payload.
        :param mean_mess_ratio: Value exposed through 'chaos'.
        :param has_sig_or_bom: Value exposed through 'bom' / 'byte_order_mark'.
        :param languages: Sequence of (language, ratio) pairs; the first pair drives
            'language' and 'coherence'.
        :param decoded_payload: Already decoded text, if available. When None the payload
            is decoded lazily on the first str() call.
        :param preemptive_declaration: Encoding declared inside the content, if any.
            Only used by output() to patch that declaration.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property.
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last produced bytes and the encoding they were produced with.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

        self._preemptive_declaration: Optional[str] = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Compare against another CharsetMatch (same encoding and same fingerprint) or
        against an encoding name given as str (normalized through iana_name).
        Any other type compares unequal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        Raise ValueError when compared with anything but a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio between the decoded character count and the raw byte count.
        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a leaf of this one.
        Raise ValueError if 'other' is not a CharsetMatch or compares equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # The stdlib table maps alias -> canonical name; look in both directions.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if not languages or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language entry, or 0.0 when no language was given."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the range names returned by unicode_range for
        the decoded characters. Computed once, then served from cache.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges (unicode_range may return a falsy value, filtered below)
        detected_ranges = {unicode_range(char) for char in str(self)}
        # filter and sort
        self._unicode_ranges = sorted(r for r in detected_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be represented in the target encoding are substituted by the
        encoder ("replace" error handler); they are not dropped and do not raise.

        The result is cached per target encoding. The cache is only updated once encoding
        succeeded, so a failing call (e.g. LookupError on an unknown codec) cannot make a
        later call return a stale payload.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the first encoding declaration found in the first 8192
                # characters so it names the target encoding.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.group(0).replace(m.groups()[0], iana_name(encoding)),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            encoded_payload = decoded_string.encode(encoding, "replace")

            # Commit both cache fields together, after the work that may raise.
            self._output_payload = encoded_payload
            self._output_encoding = encoding

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

242 

243 

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on construction. None or an
            empty list gives an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon out-of-range position.
        Raise KeyError upon encoding not present in results or unsupported key type.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            encoding_name = iana_name(item, False)
            for result in self._results:
                if encoding_name in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        Raise ValueError if 'item' is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded output and same chaos: attach to the existing
                # match instead of adding a new top-level entry.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

309 

310 

# One detected language as a (language name, ratio) pair; CharsetMatch reads
# index 0 as the language and index 1 as its coherence.
CoherenceMatch = Tuple[str, float]
# Sequence of such pairs; CharsetMatch treats the first entry as the most probable one.
CoherenceMatches = List[CoherenceMatch]

313 

314 

class CliDetectionResult:
    """
    Plain record of one detection result, serializable to JSON through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument as-is on the attribute of the same name."""
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of the record fields, rebuilt on every access.
        The key order here is the key order of the JSON produced by to_json().
        """
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        """Return the record as a JSON object, ASCII-escaped and indented by 4 spaces."""
        return dumps(self.__dict__, ensure_ascii=True, indent=4)