1from __future__ import annotations
2
3import importlib
4from codecs import IncrementalDecoder
5from collections import Counter
6from functools import lru_cache
7from typing import Counter as TypeCounter
8
9from .constant import (
10 FREQUENCIES,
11 KO_NAMES,
12 LANGUAGE_SUPPORTED_COUNT,
13 TOO_SMALL_SEQUENCE,
14 ZH_NAMES,
15 _FREQUENCIES_SET,
16 _FREQUENCIES_RANK,
17)
18from .md import is_suspiciously_successive_range
19from .models import CoherenceMatches
20from .utils import (
21 is_accentuated,
22 is_latin,
23 is_multi_byte_encoding,
24 is_unicode_range_secondary,
25 unicode_range,
26)
27
28
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return associated unicode ranges in a single byte code page.

    Decodes every byte in 0x40-0xFE with the given code page and keeps the
    unicode ranges (non-secondary only) that account for at least 15 % of the
    decodable characters.

    :param iana_name: IANA name of a single-byte encoding (e.g. "cp1252").
    :raises OSError: If the encoding is a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError( # Defensive:
            "Function not supported on multi-byte code page"
        )

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if not chunk:
            continue

        character_range: str | None = unicode_range(chunk)

        if character_range is None:
            continue

        # Secondary ranges (symbols, punctuation blocks, ...) are not
        # discriminating for language detection; skip them.
        if is_unicode_range_secondary(character_range) is False:
            seen_ranges[character_range] = seen_ranges.get(character_range, 0) + 1
            character_count += 1

    # Guard: avoid ZeroDivisionError when no byte decoded to a
    # primary-range character (degenerate/defensive case).
    if character_count == 0:
        return []

    return sorted(
        character_range
        for character_range, count in seen_ranges.items()
        if count / character_count >= 0.15
    )
66
67
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return inferred languages used with a unicode range.
    """
    # A language qualifies as soon as one of its frequent characters
    # falls inside the given range (any() short-circuits like the
    # original break-on-first-hit loop).
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
81
82
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    detected_ranges: list[str] = encoding_unicode_range(iana_name)

    # The first non-Latin range (if any) drives the association;
    # Latin ranges alone are not discriminating.
    primary_range: str | None = next(
        (specified_range for specified_range in detected_ranges if "Latin" not in specified_range),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
101
102
103@lru_cache()
104def mb_encoding_languages(iana_name: str) -> list[str]:
105 """
106 Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
107 This function does the correspondence.
108 """
109 if (
110 iana_name.startswith("shift_")
111 or iana_name.startswith("iso2022_jp")
112 or iana_name.startswith("euc_j")
113 or iana_name == "cp932"
114 ):
115 return ["Japanese"]
116 if iana_name.startswith("gb") or iana_name in ZH_NAMES:
117 return ["Chinese"]
118 if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
119 return ["Korean"]
120
121 return []
122
123
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    characters = FREQUENCIES[language]

    # Two independent scans; each short-circuits as soon as its
    # answer is known, like the flag loop it replaces.
    has_accents: bool = any(is_accentuated(character) for character in characters)
    pure_latin: bool = all(is_latin(character) for character in characters)

    return has_accents, pure_latin
139
140
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return associated languages associated to given characters.
    """
    candidates: list[tuple[str, float]] = []

    given_characters: frozenset[str] = frozenset(characters)
    source_has_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        # Caller may restrict matching to pure-Latin languages.
        if ignore_non_latin and not target_pure_latin:
            continue

        # An accented source cannot match an accent-free language.
        if source_has_accents and not target_have_accents:
            continue

        match_ratio: float = len(
            _FREQUENCIES_SET[language] & given_characters
        ) / len(language_characters)

        if match_ratio >= 0.2:
            candidates.append((language, match_ratio))

    candidates.sort(key=lambda entry: entry[1], reverse=True)

    return [name for name, _ in candidates]
173
174
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: A key of FREQUENCIES.
    :param ordered_characters: Characters ordered from most to least frequent.
    :raises ValueError: If the given language is not supported.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available") # Defensive:

    # Guard: an empty sequence cannot match anything; without it the
    # projection-ratio division below raises ZeroDivisionError.
    if not ordered_characters:
        return 0.0

    character_approved_count: int = 0
    frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
    lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    # Languages with more than 26 frequent characters (e.g. ideographic
    # scripts) get a looser rank-distance tolerance below.
    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor mapping a rank in `ordered_characters` onto the
    # language's frequency-rank scale.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    # Pre-built rank dict for ordered_characters (avoids repeated list slicing).
    ordered_rank: dict[str, int] = {
        char: rank for rank, char in enumerate(ordered_characters)
    }

    # Pre-compute characters common to both orderings.
    # Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
    common_chars: list[tuple[int, int]] = [
        (lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
    ]

    # Pre-extract lr and orr arrays for faster iteration in the inner loop.
    # Plain integer loops with local arrays are much faster under mypyc than
    # generator expression sums over a list of tuples.
    common_count: int = len(common_chars)
    common_lr: list[int] = [p[0] for p in common_chars]
    common_orr: list[int] = [p[1] for p in common_chars]

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in frequencies_language_set:
            continue

        character_rank_in_language: int = lang_rank[character]
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabet: reject characters whose observed rank is too far
        # from their expected rank in the language.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabet: accept directly when the rank distance stays
        # within a third of the alphabet size.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Count how many characters appear "before" in both orderings,
        # and how many appear "at or after" in both orderings.
        # Single pass over pre-extracted arrays — much faster under mypyc
        # than two generator expression sums.
        before_match_count: int = 0
        after_match_count: int = 0
        for i in range(common_count):
            lr_i: int = common_lr[i]
            orr_i: int = common_orr[i]
            if lr_i < character_rank_in_language:
                if orr_i < character_rank:
                    before_match_count += 1
            else:
                if orr_i >= character_rank:
                    after_match_count += 1

        after_len: int = target_language_characters_count - character_rank_in_language

        # Lenient acceptance near the extremes of the language ranking.
        if character_rank_in_language == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if after_len == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Otherwise require at least 40 % ordering agreement on either side.
        if (
            character_rank_in_language > 0
            and before_match_count / character_rank_in_language >= 0.4
        ) or (after_len > 0 and after_match_count / after_len >= 0.4):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
274
275
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.
    """
    buckets: dict[str, list[str]] = {}

    # Single-bucket bookkeeping: while only one script has been seen,
    # the dict scan below can be skipped entirely.
    first_bucket: str | None = None
    several_buckets: bool = False

    # Cache of the previous character's range and its resolved bucket so
    # consecutive same-range characters bypass the resolution logic
    # (and the is_suspiciously_successive_range calls it performs).
    last_range: str | None = None
    last_bucket: str | None = None

    for symbol in decoded_sequence:
        if not symbol.isalpha():
            continue

        # ASCII fast-path: an alphabetic character below 128 is a-z/A-Z,
        # always "Basic Latin" — skip the unicode_range() call.
        if ord(symbol) < 128:
            symbol_range: str | None = "Basic Latin"
        else:
            symbol_range = unicode_range(symbol)
            if symbol_range is None:
                continue

        # Same range as previous character → reuse its cached bucket.
        if symbol_range == last_range:
            if last_bucket is not None:
                buckets[last_bucket].append(symbol)
            continue

        target_bucket: str | None = None

        if several_buckets:
            for known_range in buckets:
                if not is_suspiciously_successive_range(known_range, symbol_range):
                    target_bucket = known_range
                    break
        elif first_bucket is not None and not is_suspiciously_successive_range(
            first_bucket, symbol_range
        ):
            target_bucket = first_bucket

        if target_bucket is None:
            target_bucket = symbol_range

        if target_bucket not in buckets:
            buckets[target_bucket] = []
            if first_bucket is None:
                first_bucket = target_bucket
            else:
                several_buckets = True

        buckets[target_bucket].append(symbol)

        # Remember resolution for the next iteration.
        last_range = symbol_range
        last_bucket = target_bucket

    return ["".join(collected).lower() for collected in buckets.values()]
348
349
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    ratios_by_language: dict[str, list[float]] = {}

    for sub_results in results:
        for language, ratio in sub_results:
            ratios_by_language.setdefault(language, []).append(ratio)

    # Average each language's ratios, rounded like coherence_ratio does.
    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]

    return sorted(merged, key=lambda entry: entry[1], reverse=True)
376
377
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    grouped: dict[str, list[float]] = {}

    for language, ratio in results:
        # Strip the em-dash marker so alternatives collapse onto the
        # base language name.
        grouped.setdefault(language.replace("—", ""), []).append(ratio)

    # No language collected more than one ratio → nothing to merge.
    if all(len(ratios) == 1 for ratios in grouped.values()):
        return results

    return [(language, max(ratios)) for language, ratios in grouped.items()]
403
404
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    included_languages: list[str] = (
        lg_inclusion.split(",") if lg_inclusion is not None else []
    )
    if "Latin Based" in included_languages:
        ignore_non_latin = True
        included_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        # Layers too small to carry a reliable signal are skipped.
        if len(layer) <= TOO_SMALL_SEQUENCE:
            continue

        # Characters of this layer ordered from most to least frequent.
        ordered_characters: list[str] = [
            character for character, _ in Counter(layer).most_common()
        ]

        candidate_languages = included_languages or alphabet_languages(
            ordered_characters, ignore_non_latin
        )

        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, ordered_characters
            )

            if ratio < threshold:
                continue
            if ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Three strong matches are enough for this layer.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )