1from encodings.aliases import aliases
2from hashlib import sha256
3from json import dumps
4from re import sub
5from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
6
7from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
8from .utils import iana_name, is_multi_byte_encoding, unicode_range
9
10
class CharsetMatch:
    """
    One candidate encoding for a byte payload, together with the scores used to rank it.

    Instances can be compared for equality with another CharsetMatch or with an encoding
    name, and implement ``<`` so that a list of them can be sorted.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
        preemptive_declaration: Optional[str] = None,
    ):
        """
        :param payload: Original bytes; exposed untouched through ``raw``.
        :param guessed_encoding: Encoding name used to decode ``payload``.
        :param mean_mess_ratio: Score exposed through ``chaos``.
        :param has_sig_or_bom: Flag exposed through ``bom`` / ``byte_order_mark``.
        :param languages: Sequence of (language, coherence) pairs; the first entry is
            the one reported by ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available. When omitted the
            payload is decoded lazily on first ``str()`` call.
        :param preemptive_declaration: Encoding declaration associated with the payload,
            if any. Only consulted by ``output``.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the ``alphabets`` property.
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for ``output``: last produced bytes and the encoding they were produced with.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

        self._preemptive_declaration: Optional[str] = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Equal to another CharsetMatch when both encoding and fingerprint are identical.
        Equal to a str when its normalized name (``iana_name``) is this match's encoding.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Raise ValueError when ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError(
                "Cannot compare a CharsetMatch with instance <{}>".format(
                    other.__class__
                )
            )

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! (multi_byte_usage forces the payload to be decoded)
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        One minus the ratio of decoded characters to raw bytes.
        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register ``other`` as a submatch of this match.

        Raise ValueError when ``other`` is not a CharsetMatch or compares equal to this one.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        for alias, canonical in aliases.items():
            if self.encoding == alias:
                also_known_as.append(canonical)
            elif self.encoding == canonical:
                also_known_as.append(alias)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if not languages or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Coherence ratio of the first language entry, or 0.0 when there is none.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the ranges reported by ``unicode_range`` for every
        decoded character. Computed once, then served from cache.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter (drop falsy entries) and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be represented in the target encoding are substituted by the
        encoder (error handler "replace"); they are not dropped.

        The result is cached for the most recently requested encoding only.
        When a preemptive declaration other than UTF-8 is set, the first match of
        RE_POSSIBLE_ENCODING_INDICATION within the first 8192 decoded characters gets its
        first captured group replaced by ``iana_name`` of the target encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # NOTE(review): the substituted name is whatever iana_name returns;
                # confirm that spelling is the one expected inside a declaration.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.group(0).replace(
                        m.group(1), iana_name(self._output_encoding)  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,  # keyword form: positional count is deprecated since Python 3.13
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
242
243
class CharsetMatches:
    """
    Container holding CharsetMatch items, kept sorted from the most probable to the least probable one.
    Behaves like a list (iterable, indexable, sized) but does not implement every list method.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on construction. None or an
            empty list gives an empty container.
        """
        self._results: List["CharsetMatch"] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        Raise IndexError upon an out-of-range position.
        Raise KeyError when the encoding is not present in results, or when ``item`` is
        neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        Raise ValueError when ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded output and same chaos: fold into the existing match.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
309
310
# A single language guess: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]
# Language guesses for one decoded payload; CharsetMatch reads the first entry
# as the most probable language.
CoherenceMatches = List[CoherenceMatch]
313
314
class CliDetectionResult:
    """
    Record of one detection outcome for a path, serializable to JSON.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """
        Store every argument under an attribute of the same name.
        """
        # Location of the inspected input.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path

        # Encoding information.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.is_preferred: bool = is_preferred

        # Content information and scores.
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.chaos: float = chaos
        self.coherence: float = coherence

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of the exported fields, in the order they appear in the JSON output.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """
        Serialize the exported fields as ASCII-only JSON indented by four spaces.
        """
        payload = self.__dict__
        return dumps(payload, indent=4, ensure_ascii=True)