from __future__ import annotations

import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    # Walk byte values 0x40-0xFE and decode them one at a time.
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: str | None = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Keep only ranges that account for at least 15% of the decoded,
    # non-secondary characters.
    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
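
# Illustrative usage (a sketch, not part of the module's API): for a Cyrillic
# code page such as cp1251, both the ASCII letters and the Cyrillic block
# typically clear the 15% threshold. Exact output depends on the stdlib codec
# tables.
#
#   encoding_unicode_range("cp1251")  # e.g. ['Basic Latin', 'Cyrillic']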


def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return the languages inferred to use a given Unicode range.
    """
    languages: list[str] = []

    # A language qualifies as soon as one of its frequent characters falls
    # inside the target range.
    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
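
# Illustrative usage (a sketch; the exact list depends on the FREQUENCIES
# table shipped with the library):
#
#   unicode_range_languages("Cyrillic")  # e.g. ['Russian', 'Ukrainian', ...]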


@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function provides that correspondence.
    """
    unicode_ranges: list[str] = encoding_unicode_range(iana_name)
    primary_range: str | None = None

    # The first non-Latin range is treated as the primary script of the code page.
    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
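
# Illustrative usage (a sketch; outputs depend on FREQUENCIES and the codec
# tables): a purely Latin code page falls back to the generic marker, while a
# Cyrillic one maps to the Cyrillic languages.
#
#   encoding_languages("cp1252")  # -> ['Latin Based']
#   encoding_languages("cp1251")  # e.g. ['Russian', 'Ukrainian', ...]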


@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function provides that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
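
# Illustrative usage (names follow the normalized IANA spelling used
# throughout the library):
#
#   mb_encoding_languages("shift_jis")  # -> ['Japanese']
#   mb_encoding_languages("gb18030")    # -> ['Chinese']
#   mb_encoding_languages("utf_8")      # -> []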


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine the main traits of a supported language: whether it uses
    accentuated characters and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
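
# Illustrative usage (a sketch; results reflect the FREQUENCIES table):
#
#   get_target_features("French")   # likely (True, True): accents, pure Latin
#   get_target_features("Russian")  # likely (False, False): Cyrillic script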


def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages associated with the given characters, best match first.
    """
    languages: list[tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # A language without accents cannot explain an accentuated source.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        # Keep languages for which at least 20% of the frequent characters
        # are present in the source.
        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
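
# Illustrative usage (a sketch; the candidate set depends on FREQUENCIES):
# feeding the most common English letters should surface English among other
# Latin-based candidates.
#
#   alphabet_languages(list("etaoinshrdlucm"))  # e.g. ['English', 'Dutch', ...]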


def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine whether an ordered character list (most frequent first) matches
    a particular language. The result is a ratio between 0. (no correspondence)
    and 1. (near-perfect fit). Beware that this function is not strict on the
    match, in order to ease detection. (Meaning a close match counts as 1.)
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        # Project the observed rank onto the language's alphabet size so that
        # both ranks live on the same scale.
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Fall back to comparing neighborhoods: the characters ranked before
        # and after the current one, in the source versus in the language.
        characters_before_source: list[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
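
# Illustrative usage (a sketch): comparing a language's own frequency table
# against itself yields a perfect score, since every rank projects exactly
# onto itself.
#
#   characters_popularity_compare("English", FREQUENCIES["English"])  # -> 1.0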


def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str split by Unicode
    range / alphabet. E.g. a text mixing English/Latin with a bit of Hebrew
    will return two items: one containing the Latin letters, the other the
    Hebrew ones.
    """
    layers: dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: str | None = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: str | None = None

        # Merge the character into an existing layer when its range is
        # plausibly used together with that layer's range; otherwise open a
        # new layer.
        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
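
# Illustrative usage (a sketch): Latin and Hebrew are kept apart, and
# non-alphabetic characters are dropped.
#
#   alpha_unicode_split("Hello שלום")  # e.g. ['hello', 'שלום']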


def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: dict[str, list[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    # Average the ratios gathered for each language across all result sets.
    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
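
# Illustrative usage (pure arithmetic on the inputs): ratios for the same
# language are averaged, then sorted best first.
#
#   merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.5)]])
#   # -> [('English', 0.8), ('French', 0.5)]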


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function keeps only the best match and
    removes the em-dash from its name.
    """
    index_results: dict[str, list[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    # Collapse duplicates only if at least one language appeared more than once.
    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
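
# Illustrative usage: the alternative "English—" entry is folded into
# "English", keeping the higher ratio.
#
#   filter_alt_coherence_matches([("English", 0.8), ("English—", 0.85)])
#   # -> [('English', 0.85)]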


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed by layers: a layer is a character extraction by
    alphabet/Unicode range.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        # Layers too small to be statistically meaningful are skipped.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: list[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Stop early once enough strong matches have been collected.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
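
# Illustrative usage (a sketch; exact ratios depend on the FREQUENCIES table):
# a sufficiently long English sentence should rank English near the top.
#
#   coherence_ratio("The quick brown fox jumps over the lazy dog and runs away")
#   # e.g. [('English', <ratio>), ('Dutch', <ratio>), ...]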