Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/cd.py: 13%
188 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-08 06:51 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-08 06:51 +0000
1import importlib
2from codecs import IncrementalDecoder
3from collections import Counter
4from functools import lru_cache
5from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
7from .constant import (
8 FREQUENCIES,
9 KO_NAMES,
10 LANGUAGE_SUPPORTED_COUNT,
11 TOO_SMALL_SEQUENCE,
12 ZH_NAMES,
13)
14from .md import is_suspiciously_successive_range
15from .models import CoherenceMatches
16from .utils import (
17 is_accentuated,
18 is_latin,
19 is_multi_byte_encoding,
20 is_unicode_range_secondary,
21 unicode_range,
22)
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the sorted unicode ranges produced by a single byte code page.

    Every byte from 0x40 up to (but excluding) 0xFF is decoded on its own with
    the codec's incremental decoder. Ranges flagged as secondary are not
    counted. A range is kept when it accounts for at least 15% of the counted
    characters.

    Raises IOError when the given code page is a multi-byte one.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    # errors="ignore": bytes that cannot be decoded yield an empty string.
    byte_decoder: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    hits_per_range: Dict[str, int] = {}
    total_hits: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = byte_decoder.decode(bytes([byte_value]))

        if not decoded:
            continue

        detected_range: Optional[str] = unicode_range(decoded)

        if detected_range is None:
            continue

        if is_unicode_range_secondary(detected_range) is not False:
            continue

        hits_per_range[detected_range] = hits_per_range.get(detected_range, 0) + 1
        total_hits += 1

    # total_hits is only used as a divisor when at least one range was counted.
    return sorted(
        range_name
        for range_name, hits in hits_per_range.items()
        if hits / total_hits >= 0.15
    )
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return the languages that use the given unicode range.

    A language from FREQUENCIES is selected as soon as one of its characters
    belongs to primary_range. Languages are listed in FREQUENCIES order.
    """
    return [
        language
        for language, alphabet in FREQUENCIES.items()
        if any(unicode_range(letter) == primary_range for letter in alphabet)
    ]
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte encoding with the language(s) it most likely carries.

    The first unicode range of the code page whose name does not contain
    "Latin" is taken as the primary range and mapped to languages. When every
    range is a Latin one, the generic ["Latin Based"] marker is returned.
    """
    non_latin_ranges = (
        candidate
        for candidate in encoding_unicode_range(iana_name)
        if "Latin" not in candidate
    )
    primary_range: Optional[str] = next(non_latin_ranges, None)

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a multi-byte encoding with the language it is tied to.

    Returns a one-item list ("Japanese", "Chinese" or "Korean") based on the
    encoding name, or an empty list when the name matches none of the rules.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Describe a supported language with two flags: (has accents, is pure Latin).

    The first flag is True when at least one character of the language is
    accentuated. The second is True unless a character is reported as
    non-Latin. Raises KeyError when the language is not in FREQUENCIES.
    """
    alphabet = FREQUENCIES[language]

    has_accents: bool = any(is_accentuated(letter) for letter in alphabet)
    only_latin: bool = not any(is_latin(letter) is False for letter in alphabet)

    return has_accents, only_latin
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages compatible with the given characters, best match first.

    A language is kept when at least 20% of its known characters appear in
    `characters`. Languages are skipped when:
      - ignore_non_latin is set and the language is not purely Latin;
      - the source contains accentuated characters but the language has none.

    :param characters: characters observed in the analysed text.
    :param ignore_non_latin: restrict the candidates to pure Latin languages.
    :return: language names ordered by decreasing match ratio.
    """
    source_have_accents = any(is_accentuated(character) for character in characters)

    # Membership is tested once per character of every language: build the
    # set a single time instead of scanning the list on each test.
    available_characters = set(characters)

    languages: List[Tuple[str, float]] = []

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_match_count: int = sum(
            1 for c in language_characters if c in available_characters
        )

        ratio: float = character_match_count / len(language_characters)

        if ratio >= 0.2:
            languages.append((language, ratio))

    # Stable sort: languages with equal ratios keep their FREQUENCIES order.
    languages.sort(key=lambda match: match[1], reverse=True)

    return [language for language, _ in languages]
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered character list (from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: a key of FREQUENCIES.
    :param ordered_characters: characters sorted by decreasing number of occurrences.
    :return: approved characters count divided by the number of given characters;
             0.0 when no character is given.
    :raises ValueError: when the language is not available in FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: avoid dividing by zero below.
    if ordered_characters_count == 0:
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank of each character in the language, computed once. setdefault keeps
    # the first occurrence, exactly like list.index would.
    rank_in_language: Dict[str, int] = {}
    for rank, known_character in enumerate(language_characters):
        rank_in_language.setdefault(known_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor projecting a rank in the source onto the language's
    # (differently sized) frequency table. Invariant across the loop.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language: Optional[int] = rank_in_language.get(character)

        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a character too far from its expected rank is rejected.
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: being within a third of the table is good enough.
        if (
            large_alphabet is True
            and rank_distance < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Otherwise compare which characters sit before/after this one in the
        # source and in the language's frequency table.
        characters_before_source: List[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split a decoded text into layers, one per unicode range / alphabet.
    Ex. a text containing English/Latin with a bit of Hebrew yields two items:
    one holding the Latin letters and the other the Hebrew ones.

    Only alphabetic characters with a known unicode range are kept, and they
    are stored lowercased. A character joins the first existing layer whose
    range is not a suspicious neighbour of its own range; otherwise it goes
    to the layer of its own range.
    """
    layer_letters: Dict[str, List[str]] = {}

    for letter in decoded_sequence:
        if letter.isalpha() is False:
            continue

        letter_range: Optional[str] = unicode_range(letter)

        if letter_range is None:
            continue

        compatible_layer: Optional[str] = next(
            (
                known_range
                for known_range in layer_letters
                if is_suspiciously_successive_range(known_range, letter_range)
                is False
            ),
            None,
        )

        destination: str = (
            letter_range if compatible_layer is None else compatible_layer
        )

        layer_letters.setdefault(destination, []).append(letter.lower())

    return ["".join(letters) for letters in layer_letters.values()]
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several results previously produced by the function coherence_ratio.

    Ratios are grouped per language and averaged (rounded to 4 decimals).
    The output has the same shape as a coherence_ratio result, sorted by
    decreasing ratio.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for coherence_matches in results:
        for language, ratio in coherence_matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]

    return sorted(averaged, key=lambda match: match[1], reverse=True)
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". Matches are grouped under the language name stripped of its
    em-dash; when a name occurs more than once, each name is returned once
    with its best ratio. Otherwise the input is returned untouched.
    """
    ratios_by_name: Dict[str, List[float]] = {}

    for language, ratio in results:
        ratios_by_name.setdefault(language.replace("—", ""), []).append(ratio)

    # No name collected more than one ratio: nothing to collapse.
    if all(len(ratios) <= 1 for ratios in ratios_by_name.values()):
        return results

    return [(name, max(ratios)) for name, ratios in ratios_by_name.items()]
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.
    The sequence is analysed by layers (a layer = characters extracted per alphabet/range).

    lg_inclusion is an optional comma separated list of languages to test
    instead of the ones inferred from the characters. The special item
    "Latin Based" is removed from that list and restricts inference to pure
    Latin languages. Matches scoring below threshold are dropped.
    The result is sorted by decreasing ratio.
    """
    forced_languages: List[str] = (
        [] if lg_inclusion is None else lg_inclusion.split(",")
    )

    latin_only: bool = "Latin Based" in forced_languages
    if latin_only:
        forced_languages.remove("Latin Based")

    matches: List[Tuple[str, float]] = []
    strong_match_count: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        ranked_letters = Counter(layer).most_common()

        layer_size: int = sum(occurrences for _, occurrences in ranked_letters)

        if layer_size <= TOO_SMALL_SEQUENCE:
            continue

        letters_by_popularity: List[str] = [letter for letter, _ in ranked_letters]

        # An explicit (non-empty) inclusion list takes precedence over inference.
        candidates = forced_languages or alphabet_languages(
            letters_by_popularity, latin_only
        )

        for language in candidates:
            score: float = characters_popularity_compare(
                language, letters_by_popularity
            )

            if score < threshold:
                continue

            if score >= 0.8:
                strong_match_count += 1

            matches.append((language, round(score, 4)))

            # Enough strong matches were collected: stop scanning this layer.
            if strong_match_count >= 3:
                break

    deduplicated = filter_alt_coherence_matches(matches)

    return sorted(deduplicated, key=lambda match: match[1], reverse=True)