1from encodings.aliases import aliases
2from hashlib import sha256
3from json import dumps
4from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
5
6from .constant import TOO_BIG_SEQUENCE
7from .utils import iana_name, is_multi_byte_encoding, unicode_range
8
9
class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Holds the raw bytes, the guessed encoding, the mean mess ("chaos") ratio, the
    language coherence matches and, lazily, the decoded text. Other matches that
    were registered through add_submatch() are kept as "leaves".

    Instances define __eq__ without __hash__ and are therefore not hashable.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Encoding name used to decode the payload.
        :param mean_mess_ratio: Value exposed through the 'chaos' property.
        :param has_sig_or_bom: Value exposed through 'bom' / 'byte_order_mark'.
        :param languages: Sequence of (language, coherence ratio) pairs; the first
            entry is the one reported by 'language' and 'coherence'.
        :param decoded_payload: Already decoded text, if available. When omitted
            the payload is decoded on first use (see __str__).
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property.
        self._unicode_ranges: Optional[List[str]] = None

        # Matches registered through add_submatch().
        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last re-encoded payload and the encoding used for it.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Compare against another CharsetMatch (same encoding and same fingerprint)
        or against an encoding name given as str (normalized through iana_name).
        Any other type compares unequal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Raise ValueError when 'other' is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Chaos ratios closer than 0.01 are considered a tie on chaos.
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            # Tie on chaos but clearly distinct coherence: highest coherence first.
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many
            # multi-byte as possible.
            # multi_byte_usage needs the fully decoded text; skip it for big
            # payloads to preserve RAM usage and fall back to the chaos ratio.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Return 1.0 minus the ratio (decoded characters / raw bytes).

        An empty payload yields 0.0 instead of raising ZeroDivisionError.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Return the decoded payload. Decoding is strict and happens only once; the
        result is kept until add_submatch() unloads it on a submatch.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register 'other' as a submatch (leaf) of this match.

        Raise ValueError when 'other' is not a CharsetMatch or compares equal to
        this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Guessed encoding name, as given to the constructor."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encodings are known by many names; using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # 'aliases' maps one name to another; collect the counterpart whichever
        # side of the mapping our encoding sits on.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Return the 'has_sig_or_bom' flag given to the constructor."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as the 'bom' property."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property returns something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # It's either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """Mean mess ratio given to the constructor."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence ratio of the first language match, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """'chaos' multiplied by 100 and rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """'coherence' multiplied by 100 and rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Matches registered through add_submatch() (the internal list, not a copy)."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """True when at least one submatch has been registered."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the unicode range names returned by
        unicode_range() for the characters of the decoded payload. Computed once
        and cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter out falsy results, de-duplicate and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encodings that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be represented in the target encoding are REPLACED by the
        encoder's replacement marker (error handler "replace"); they are not dropped.
        The last result is cached and reused while the same target encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        The digest is taken over output() with its default target encoding.
        """
        return sha256(self.output()).hexdigest()
221
222
class CharsetMatches:
    """
    Container with every CharsetMatch item, ordered by default from the most probable to the least probable one.
    Acts like a list (iterable) but does not implement all related methods.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        """
        :param results: Initial matches; they are stored sorted. None or an empty
            list gives an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        """Iterate over the stored matches in their sorted order."""
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon an out-of-range position.
        Raise KeyError (carrying the requested key) upon an encoding not present in results or an
        unsupported key type.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalize the requested name before looking it up.
            normalized_name = iana_name(item, False)
            for result in self._results:
                if normalized_name in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        Raise ValueError when 'item' is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        # Strict '<' keeps the boundary consistent with CharsetMatch.__lt__, which
        # already treats len(payload) >= TOO_BIG_SEQUENCE as too big.
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded output and same chaos: fold the new item into
                # the existing match instead of storing a duplicate entry.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match. Equivalent to matches[0], except that None is
        returned when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
288
289
# One language guess: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]
# All language guesses attached to a match; the first entry is the one reported
# by CharsetMatch.language / CharsetMatch.coherence.
CoherenceMatches = List[Tuple[str, float]]
292
293
class CliDetectionResult:
    """
    Record of the detection outcome for one path.

    Each constructor argument is stored under an attribute of the same name.
    The '__dict__' property exposes those attributes as a plain dict in a fixed
    key order, and to_json() serializes that dict.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        # Location of the analysed content.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path

        # Detected encoding and its alternatives.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings

        # Content description.
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom

        # Scores and ranking flag.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """Return the stored fields as a dict; key order below is the output order."""
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """Serialize the fields as ASCII-only JSON indented by 4 spaces."""
        serializable = self.__dict__
        return dumps(serializable, indent=4, ensure_ascii=True)