1from __future__ import annotations
2
3from encodings.aliases import aliases
4from json import dumps
5from re import sub
6from typing import Any, Iterator, List, Tuple
7
8from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
9from .utils import iana_name, is_multi_byte_encoding, unicode_range
10
11
class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Stores the raw payload, the guessed encoding, the scores used for ranking
    (``chaos`` and ``coherence``) and the sub-matches registered through
    ``add_submatch``.
    """

    def __init__(
        self,
        payload: bytes | bytearray,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        """
        :param payload: Raw bytes, returned untouched by ``raw``.
        :param guessed_encoding: Codec name used to decode ``payload``.
        :param mean_mess_ratio: Score exposed through ``chaos``.
        :param has_sig_or_bom: Flag exposed through ``bom`` / ``byte_order_mark``.
        :param languages: ``(language, ratio)`` pairs; the first entry feeds
            ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text. When omitted, decoding is
            done lazily by ``__str__``.
        :param preemptive_declaration: When set and not a UTF-8 spelling,
            ``output`` rewrites the first encoding indication it finds in the
            head of the decoded text.
        """
        self._payload: bytes | bytearray = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the ``alphabets`` property.
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for ``output``: last produced payload and the encoding it used.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Compare against another match or against an encoding name.

        A ``str`` is normalised with ``iana_name`` and compared to the encoding.
        Another CharsetMatch is equal when both the encoding and the fingerprint
        match. Any other type is never equal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        Raise ValueError when ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 0.5% difference --> Use Coherence
        if chaos_difference < 0.005 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.005 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Return ``1 - (decoded length / raw length)``.

        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Return the decoded payload, decoding it (strictly) on first access.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
            # UTF-7 BOM is encoded in modified Base64 whose byte boundary
            # can overlap with the next character, so raw-byte stripping
            # is unreliable. Strip the decoded BOM character instead.
            if (
                self._has_sig_or_bom
                and self._encoding == "utf_7"
                and self._string
                and self._string[0] == "\ufeff"
            ):
                self._string = self._string[1:]
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Register ``other`` as a sub-match of this match.

        Raise ValueError when ``other`` is not a CharsetMatch or compares equal
        to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """
        Guessed encoding name given at construction.
        """
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        # ``aliases`` maps one name to another; collect the counterpart of
        # every entry in which this encoding appears on either side.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """
        Value of the ``has_sig_or_bom`` flag given at construction.
        """
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """
        Same value as ``bom``.
        """
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """
        Mean mess ratio given at construction.
        """
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Ratio of the first language entry, or 0.0 when no language is known.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """
        ``chaos`` as a percentage rounded to three digits.
        """
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """
        ``coherence`` as a percentage rounded to three digits.
        """
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes | bytearray:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        """
        Matches registered through ``add_submatch``.
        """
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """
        Whether at least one sub-match was registered.
        """
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """
        Sorted, de-duplicated unicode range names of the decoded characters.
        Computed once, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are replaced by the encoder ("replace" error handler).

        The result is cached and reused while the same ``encoding`` is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the first encoding indication found in the first
                # 8192 characters so it names the target encoding.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(encoding).replace("_", "-"),
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")
            # Record the cache key only after a successful encode; otherwise a
            # failed call (e.g. unknown codec) would make the next identical
            # call return the stale payload of a previous encoding.
            self._output_encoding = encoding

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> int:
        """
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        """
        return hash(str(self))
253
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        """
        :param results: Initial matches; they are stored sorted. ``None`` or an
            empty list gives an empty container.
        """
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        Raise IndexError upon out-of-range position.
        Raise KeyError (carrying the requested key) when the encoding is not present in results,
        or when ``item`` is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the name in non-strict mode before searching, so the
            # caller's original key stays available for the error below.
            normalized = iana_name(item, False)
            for result in self._results:
                if normalized in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        Raise ValueError when ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same decoded text and same chaos: fold into the existing match.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
320
# One coherence result. CharsetMatch reads item 0 as the language name and
# item 1 as the coherence ratio.
CoherenceMatch = Tuple[str, float]
# List of coherence results. CharsetMatch uses the first entry for its
# ``language`` and ``coherence`` properties.
CoherenceMatches = List[CoherenceMatch]
323
324
class CliDetectionResult:
    """
    Value holder for the fields of one detection result, exportable as JSON.
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        """
        Store every argument on the instance under the same name.
        """
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        """
        Mapping of field name to value, in the key order used for JSON export.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """
        Serialize the fields as ASCII-only JSON indented with four spaces.
        """
        snapshot = self.__dict__
        return dumps(snapshot, ensure_ascii=True, indent=4)