from __future__ import annotations

from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Iterator, List, Tuple

from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


class CharsetMatch:
    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, CharsetMatch):
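            # A plain string is treated as an encoding name, so expressions such
            # as `match == "utf_8"` compare against the normalized IANA name.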
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return (
            self.encoding == other.encoding and self.fingerprint == other.fingerprint
        )

    def __lt__(self, other: object) -> bool:
        """
        Implemented so that sorted() can be used on a list of CharsetMatch items.
        """
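        # Illustrative sketch (hypothetical names): with two CharsetMatch
        # instances `a` and `b`, `sorted([a, b])[0]` is the lower-chaos match;
        # coherence and multi-byte usage act as tie-breakers below.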
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Chaos difference below 1%: fall back to coherence.
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When the decision is difficult, prefer the result that decoded the
            # most multi-byte sequences; skip this for very large payloads to
            # preserve RAM.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
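        # Share of the raw payload taken up by multi-byte sequences:
        # 1.0 minus (decoded character count / raw byte count).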
        return 1.0 - (len(str(self)) / len(self.raw))

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encodings are known by many names; this can help when, for example,
        IBM855 is also listed as CP855.
        """
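        # encodings.aliases maps alias -> canonical codec name; collect matches
        # in both directions so either spelling of the encoding is covered.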
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in the decoded sequence.
        Usually not very useful. The returned list may be empty even if the
        'language' property returns something other than 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in the decoded sequence. If none were detected
        or inferred, the property will return "Unknown".
        """
        if not self._languages:
            # Try to infer the language from the given encoding.
            # It is either English or we should not commit to an answer.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # Imported here to avoid a circular import.
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

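            # "Latin Based" is too generic to name a single language, so it is
            # treated the same as having no result at all.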
            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # List the detected Unicode ranges.
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # Filter out None values and sort the unique ranges.
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encodings that produce the exact same str result and
        could therefore be the originating encoding.
        This list includes the encoding available in the 'encoding' property.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get the re-encoded bytes payload using the given target encoding.
        Defaults to UTF-8. Characters that cannot be represented in the target
        encoding are replaced (errors="replace"), not dropped.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
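            # If the payload declared its own encoding in-band (e.g. an XML prolog
            # or HTML meta charset) and that declaration is not already UTF-8,
            # rewrite the declaration within the first 8 KiB so it names the
            # target encoding.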
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded)
        payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()


class CharsetMatches:
    """
    Container holding CharsetMatch items, ordered by default from the most probable
    to the least probable. Acts like a list (iterable) but does not implement all
    related methods.
    """
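    # Illustrative usage (a hedged sketch; `from_bytes` is the package-level
    # helper that returns a CharsetMatches instance):
    #   matches = from_bytes(b"caf\xc3\xa9")
    #   matches.best()      # most probable CharsetMatch, or None if empty
    #   matches["utf_8"]    # lookup by IANA name or alias; KeyError if absent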

    def __init__(self, results: list[CharsetMatch] | None = None):
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or by encoding name (an alias
        may be used here).
        Raises KeyError for an invalid index or an encoding not present in the results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
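            # Normalize the requested name (aliases resolved) before comparing it
            # against each match's candidate charsets.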
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. It will be inserted so that the sort order is preserved.
        It may be folded in as a submatch of an existing result.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # Submatch factoring is disabled when the input is too heavy, to conserve RAM.
        if len(item.raw) < TOO_BIG_SEQUENCE:
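            # Matches that decode to an identical string (same fingerprint) with
            # the same chaos are folded in as submatches instead of being listed
            # separately.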
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match. Strictly equivalent to matches[0].
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method; delegates to best(). Kept for backward-compatibility reasons.
        """
        return self.best()


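# A CoherenceMatch pairs a language name with its coherence ratio (0.0-1.0);
# CoherenceMatches is the ranked list of such pairs attached to a CharsetMatch.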
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]


class CliDetectionResult:
    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
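        # __dict__ is exposed as a property so that to_json() serializes a stable,
        # explicit set of keys.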
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        return dumps(self.__dict__, ensure_ascii=True, indent=4)