1from __future__ import annotations
2
3from encodings.aliases import aliases
4from json import dumps
5from re import sub
6from typing import Any, Iterator, List, Tuple
7
8from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
9from .utils import iana_name, is_multi_byte_encoding, unicode_range
10
11
class CharsetMatch:
    """
    One candidate interpretation of a bytes payload: the guessed encoding together with the scores
    (mess ratio and language coherence) that are used to rank it against other candidates.
    The decoded text is produced lazily on the first str() call and then cached.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        """
        :param payload: Original bytes, kept untouched and exposed through 'raw'.
        :param guessed_encoding: Codec name used to decode the payload.
        :param mean_mess_ratio: Mess ratio, exposed through 'chaos'.
        :param has_sig_or_bom: Signature/BOM flag, exposed through 'bom' and 'byte_order_mark'.
        :param languages: (language, coherence ratio) pairs; the first entry drives 'language' and 'coherence'.
        :param decoded_payload: Already decoded text, when available, so the lazy decoding can be skipped.
        :param preemptive_declaration: Presumably the encoding name declared inside the content itself;
            output() only rewrites the declaration when this is set and is not a UTF-8 spelling.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property, filled on first access.
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last produced bytes and the encoding they were produced for.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text; None means "not decoded yet" (see __str__).
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both their encoding and fingerprint are equal.
        A str is accepted too: it is normalized with iana_name() and compared against the encoding.
        Any other type compares unequal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        "Lower" means "more probable". Raise ValueError if other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! (multi_byte_usage needs the decoded string)
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 - (number of decoded characters / number of raw bytes).
        The more bytes were consumed per character, the closer to 1.0.
        An empty payload yields 0.0 (nothing was decoded as multi-byte) instead of dividing by zero.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Decoded payload. Decoding is strict, happens once, and the result is cached.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Attach another match as a leaf of this one; it then shows up in 'submatch' and 'could_be_from_charset'.
        Raise ValueError if other is not a CharsetMatch or is equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                f"Unable to add instance <{other.__class__}> as a submatch of a CharsetMatch"
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """
        Guessed encoding name, as given at construction.
        """
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        current: str = self.encoding
        also_known_as: list[str] = []
        # The stdlib table maps alias -> codec name; collect the counterpart whichever side matches.
        for u, p in aliases.items():
            if current == u:
                also_known_as.append(p)
            elif current == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """
        Signature/BOM flag given at construction.
        """
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """
        Same value as 'bom'.
        """
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """
        Mess ratio given at construction.
        """
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Coherence ratio of the first language entry, or 0.0 when no language was attached.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """
        'chaos' expressed as a percentage, rounded to 3 digits.
        """
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """
        'coherence' expressed as a percentage, rounded to 3 digits.
        """
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        """
        Matches attached through add_submatch(). The internal list itself is returned, not a copy.
        """
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """
        True when at least one submatch was attached.
        """
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """
        Sorted, de-duplicated unicode range names found in the decoded payload.
        Computed on first access (walks every character) and cached afterwards.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges (a set: one entry per distinct range)
        detected_ranges: set[str | None] = {unicode_range(char) for char in str(self)}
        # filter out characters without a known range, then sort
        self._unicode_ranges = sorted(r for r in detected_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that the target encoding cannot represent are replaced by the encoder
        (errors="replace"), they are neither dropped nor raising.

        The result is cached and only recomputed when a different target encoding is requested.
        When a preemptive declaration exists and is not a UTF-8 spelling, the first match of
        RE_POSSIBLE_ENCODING_INDICATION within the first 8192 characters gets its first group
        replaced by the target encoding name (IANA name, underscores turned into hyphens).
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # iana_name() is deliberately resolved inside the callback so it only runs on a match.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.group(0).replace(
                        m.group(1),
                        iana_name(encoding).replace("_", "-"),
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> int:
        """
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        Built on the builtin hash() of a str, so it is only meant to be compared within the running
        process; do not persist it.
        """
        return hash(str(self))
242
243
class CharsetMatches:
    """
    Sorted collection of CharsetMatch items, most probable first.
    Behaves like a read-mostly sequence: iteration, indexing, len() and truthiness are supported,
    the rest of the list API is not.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        # A sorted copy is stored; None or an empty list both give an empty container.
        if results:
            self._results: list[CharsetMatch] = sorted(results)
        else:
            self._results = []

    def __iter__(self) -> Iterator[CharsetMatch]:
        for match in self._results:
            yield match

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Look a match up by position (int) or by encoding name (str, normalized through iana_name).
        A name matches the first result listing it in 'could_be_from_charset'.
        An out-of-range position raises IndexError (from the underlying list);
        an unknown name or an unsupported key type raises KeyError.
        """
        if isinstance(item, int):
            return self._results[item]
        if not isinstance(item, str):
            raise KeyError

        wanted = iana_name(item, False)
        for candidate in self._results:
            if wanted in candidate.could_be_from_charset:
                return candidate
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Add one match and keep the container sorted.
        If an existing result has the same fingerprint and the same chaos, the item is attached
        to it as a submatch instead of being stored as a separate result.
        Raise ValueError if item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                f"Cannot append instance '{item.__class__}' to CharsetMatches"
            )

        # Submatch factoring is skipped for heavy inputs (it requires decoding, which costs RAM).
        factoring_allowed = len(item.raw) < TOO_BIG_SEQUENCE
        if factoring_allowed:
            for existing in self._results:
                if (
                    existing.fingerprint == item.fingerprint
                    and existing.chaos == item.chaos
                ):
                    existing.add_submatch(item)
                    return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Return the most probable match (same as matches[0]), or None when the container is empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> CharsetMatch | None:
        """
        Alias of best(), retained for backward compatibility.
        """
        return self.best()
309
310
# One language detection entry: (language name, coherence ratio).
# CharsetMatch.languages reads element 0 and CharsetMatch.coherence reads element 1.
CoherenceMatch = Tuple[str, float]
# The language entries attached to a CharsetMatch. Only the first entry is used for
# 'language' and 'coherence', so the list is presumably ordered best-first - confirm with the producer.
# These aliases are defined after CharsetMatch; its annotations can still name them because the
# module starts with `from __future__ import annotations`.
CoherenceMatches = List[CoherenceMatch]
313
314
class CliDetectionResult:
    """
    Plain record describing the detection outcome for one path, serializable to JSON via to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        # Every argument is stored as-is under an attribute of the same name.
        self.path: str = path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: str | None = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        """
        Mapping of the exported fields, built on each access; the key order is the JSON output order.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """
        Serialize the exported fields as an ASCII-only JSON document indented by 4 spaces.
        """
        exported = self.__dict__
        return dumps(exported, ensure_ascii=True, indent=4)