Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/models.py: 37%
174 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:37 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:37 +0000
1from encodings.aliases import aliases
2from hashlib import sha256
3from json import dumps
4from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
6from .constant import TOO_BIG_SEQUENCE
7from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
    """
    One candidate encoding for a payload, together with the figures used to rank it.

    An instance keeps the raw bytes, the guessed encoding name, the mean mess
    ("chaos") ratio, the detected languages with their scores, and a list of
    sub-matches registered through add_submatch().
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original bytes, exposed untouched through the 'raw' property.
        :param guessed_encoding: Codec name used to decode the payload.
        :param mean_mess_ratio: Value returned by the 'chaos' property.
        :param has_sig_or_bom: Value returned by 'bom' and 'byte_order_mark'.
        :param languages: Sequence of (language, score) pairs; the first pair feeds
            the 'language' and 'coherence' properties.
        :param decoded_payload: Already decoded text, if available. When None the
            payload is decoded lazily on the first str() call.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property.
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        # Initialised here but not read by any method of this class.
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last requested target encoding and its result.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both their encoding and their fingerprint agree.

        :raises TypeError: If other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        A match is "less than" another one when it should be listed first.

        :raises ValueError: If other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01:
            if coherence_difference > 0.02:
                return self.coherence > other.coherence
            # When having a tough decision, use the result that decoded as many multi-byte as possible.
            # This check used to be nested under "coherence_difference > 0.02", where equal
            # coherence values could never occur, so it was unreachable.
            if chaos_difference == 0.0 and self.coherence == other.coherence:
                return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Share of the payload saved by decoding: 1 - (decoded length / byte length).

        Returns 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - len(str(self)) / raw_length

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a leaf of this one.

        :raises ValueError: If other is not a CharsetMatch or compares equal to self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # 'aliases' maps alias -> codec name; collect the counterpart on either side.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        # Same value as 'bom'; both names are kept.
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if not languages or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Score of the first language entry, or 0.0 when no language was recorded.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated names returned by unicode_range() for each decoded character.
        Computed once, then served from cache.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that the target encoding cannot represent are substituted by the encoder
        ("replace" error handler); they are neither dropped nor cause an exception.

        The result is cached for the most recently requested encoding only.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        # Always hashes the default (UTF-8) output, which also refreshes the output() cache.
        return sha256(self.output()).hexdigest()
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        # Ordering relies on CharsetMatch.__lt__; None or an empty list gives an empty container.
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: If item is an integer outside the range of stored results.
        :raises KeyError: If item is an encoding name absent from every result, or is
            neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name before comparing.
            # NOTE(review): the second argument is presumably a "strict" flag — confirm in utils.iana_name.
            normalized_name = iana_name(item, False)
            for result in self._results:
                if normalized_name in result.could_be_from_charset:
                    return result
        # Report the key as the caller supplied it.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: If item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded content and same chaos: attach to the existing match instead.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
# One language guess: (language name, score). CharsetMatch reads index 0 as the
# language and index 1 as the coherence value.
CoherenceMatch = Tuple[str, float]
# Sequence of language guesses; CharsetMatch treats the first entry as the most probable one.
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
    """
    Flat record of detection fields for one path, serialisable to JSON via to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        # Every argument is stored as-is under an attribute of the same name.
        self.path: str = path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: Optional[str] = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Build a fresh mapping of the public fields, in the order used for serialisation.
        """
        ordered_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in ordered_fields}

    def to_json(self) -> str:
        """
        Serialise the field mapping as ASCII-only JSON indented by four spaces.
        """
        fields_mapping = self.__dict__
        return dumps(fields_mapping, indent=4, ensure_ascii=True)