1from __future__ import annotations 
    2 
    3from encodings.aliases import aliases 
    4from hashlib import sha256 
    5from json import dumps 
    6from re import sub 
    7from typing import Any, Iterator, List, Tuple 
    8 
    9from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE 
    10from .utils import iana_name, is_multi_byte_encoding, unicode_range 
    11 
    12 
    13class CharsetMatch: 
    14    def __init__( 
    15        self, 
    16        payload: bytes, 
    17        guessed_encoding: str, 
    18        mean_mess_ratio: float, 
    19        has_sig_or_bom: bool, 
    20        languages: CoherenceMatches, 
    21        decoded_payload: str | None = None, 
    22        preemptive_declaration: str | None = None, 
    23    ): 
    24        self._payload: bytes = payload 
    25 
    26        self._encoding: str = guessed_encoding 
    27        self._mean_mess_ratio: float = mean_mess_ratio 
    28        self._languages: CoherenceMatches = languages 
    29        self._has_sig_or_bom: bool = has_sig_or_bom 
    30        self._unicode_ranges: list[str] | None = None 
    31 
    32        self._leaves: list[CharsetMatch] = [] 
    33        self._mean_coherence_ratio: float = 0.0 
    34 
    35        self._output_payload: bytes | None = None 
    36        self._output_encoding: str | None = None 
    37 
    38        self._string: str | None = decoded_payload 
    39 
    40        self._preemptive_declaration: str | None = preemptive_declaration 
    41 
    42    def __eq__(self, other: object) -> bool: 
    43        if not isinstance(other, CharsetMatch): 
    44            if isinstance(other, str): 
    45                return iana_name(other) == self.encoding 
    46            return False 
    47        return self.encoding == other.encoding and self.fingerprint == other.fingerprint 
    48 
    49    def __lt__(self, other: object) -> bool: 
    50        """ 
    51        Implemented to make sorted available upon CharsetMatches items. 
    52        """ 
    53        if not isinstance(other, CharsetMatch): 
    54            raise ValueError 
    55 
    56        chaos_difference: float = abs(self.chaos - other.chaos) 
    57        coherence_difference: float = abs(self.coherence - other.coherence) 
    58 
    59        # Below 1% difference --> Use Coherence 
    60        if chaos_difference < 0.01 and coherence_difference > 0.02: 
    61            return self.coherence > other.coherence 
    62        elif chaos_difference < 0.01 and coherence_difference <= 0.02: 
    63            # When having a difficult decision, use the result that decoded as many multi-byte as possible. 
    64            # preserve RAM usage! 
    65            if len(self._payload) >= TOO_BIG_SEQUENCE: 
    66                return self.chaos < other.chaos 
    67            return self.multi_byte_usage > other.multi_byte_usage 
    68 
    69        return self.chaos < other.chaos 
    70 
    71    @property 
    72    def multi_byte_usage(self) -> float: 
    73        return 1.0 - (len(str(self)) / len(self.raw)) 
    74 
    75    def __str__(self) -> str: 
    76        # Lazy Str Loading 
    77        if self._string is None: 
    78            self._string = str(self._payload, self._encoding, "strict") 
    79        return self._string 
    80 
    81    def __repr__(self) -> str: 
    82        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>" 
    83 
    84    def add_submatch(self, other: CharsetMatch) -> None: 
    85        if not isinstance(other, CharsetMatch) or other == self: 
    86            raise ValueError( 
    87                "Unable to add instance <{}> as a submatch of a CharsetMatch".format( 
    88                    other.__class__ 
    89                ) 
    90            ) 
    91 
    92        other._string = None  # Unload RAM usage; dirty trick. 
    93        self._leaves.append(other) 
    94 
    95    @property 
    96    def encoding(self) -> str: 
    97        return self._encoding 
    98 
    99    @property 
    100    def encoding_aliases(self) -> list[str]: 
    101        """ 
    102        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855. 
    103        """ 
    104        also_known_as: list[str] = [] 
    105        for u, p in aliases.items(): 
    106            if self.encoding == u: 
    107                also_known_as.append(p) 
    108            elif self.encoding == p: 
    109                also_known_as.append(u) 
    110        return also_known_as 
    111 
    112    @property 
    113    def bom(self) -> bool: 
    114        return self._has_sig_or_bom 
    115 
    116    @property 
    117    def byte_order_mark(self) -> bool: 
    118        return self._has_sig_or_bom 
    119 
    120    @property 
    121    def languages(self) -> list[str]: 
    122        """ 
    123        Return the complete list of possible languages found in decoded sequence. 
    124        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'. 
    125        """ 
    126        return [e[0] for e in self._languages] 
    127 
    128    @property 
    129    def language(self) -> str: 
    130        """ 
    131        Most probable language found in decoded sequence. If none were detected or inferred, the property will return 
    132        "Unknown". 
    133        """ 
    134        if not self._languages: 
    135            # Trying to infer the language based on the given encoding 
    136            # Its either English or we should not pronounce ourselves in certain cases. 
    137            if "ascii" in self.could_be_from_charset: 
    138                return "English" 
    139 
    140            # doing it there to avoid circular import 
    141            from charset_normalizer.cd import encoding_languages, mb_encoding_languages 
    142 
    143            languages = ( 
    144                mb_encoding_languages(self.encoding) 
    145                if is_multi_byte_encoding(self.encoding) 
    146                else encoding_languages(self.encoding) 
    147            ) 
    148 
    149            if len(languages) == 0 or "Latin Based" in languages: 
    150                return "Unknown" 
    151 
    152            return languages[0] 
    153 
    154        return self._languages[0][0] 
    155 
    156    @property 
    157    def chaos(self) -> float: 
    158        return self._mean_mess_ratio 
    159 
    160    @property 
    161    def coherence(self) -> float: 
    162        if not self._languages: 
    163            return 0.0 
    164        return self._languages[0][1] 
    165 
    166    @property 
    167    def percent_chaos(self) -> float: 
    168        return round(self.chaos * 100, ndigits=3) 
    169 
    170    @property 
    171    def percent_coherence(self) -> float: 
    172        return round(self.coherence * 100, ndigits=3) 
    173 
    174    @property 
    175    def raw(self) -> bytes: 
    176        """ 
    177        Original untouched bytes. 
    178        """ 
    179        return self._payload 
    180 
    181    @property 
    182    def submatch(self) -> list[CharsetMatch]: 
    183        return self._leaves 
    184 
    185    @property 
    186    def has_submatch(self) -> bool: 
    187        return len(self._leaves) > 0 
    188 
    189    @property 
    190    def alphabets(self) -> list[str]: 
    191        if self._unicode_ranges is not None: 
    192            return self._unicode_ranges 
    193        # list detected ranges 
    194        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)] 
    195        # filter and sort 
    196        self._unicode_ranges = sorted(list({r for r in detected_ranges if r})) 
    197        return self._unicode_ranges 
    198 
    199    @property 
    200    def could_be_from_charset(self) -> list[str]: 
    201        """ 
    202        The complete list of encoding that output the exact SAME str result and therefore could be the originating 
    203        encoding. 
    204        This list does include the encoding available in property 'encoding'. 
    205        """ 
    206        return [self._encoding] + [m.encoding for m in self._leaves] 
    207 
    208    def output(self, encoding: str = "utf_8") -> bytes: 
    209        """ 
    210        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8. 
    211        Any errors will be simply ignored by the encoder NOT replaced. 
    212        """ 
    213        if self._output_encoding is None or self._output_encoding != encoding: 
    214            self._output_encoding = encoding 
    215            decoded_string = str(self) 
    216            if ( 
    217                self._preemptive_declaration is not None 
    218                and self._preemptive_declaration.lower() 
    219                not in ["utf-8", "utf8", "utf_8"] 
    220            ): 
    221                patched_header = sub( 
    222                    RE_POSSIBLE_ENCODING_INDICATION, 
    223                    lambda m: m.string[m.span()[0] : m.span()[1]].replace( 
    224                        m.groups()[0], 
    225                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type] 
    226                    ), 
    227                    decoded_string[:8192], 
    228                    count=1, 
    229                ) 
    230 
    231                decoded_string = patched_header + decoded_string[8192:] 
    232 
    233            self._output_payload = decoded_string.encode(encoding, "replace") 
    234 
    235        return self._output_payload  # type: ignore 
    236 
    237    @property 
    238    def fingerprint(self) -> str: 
    239        """ 
    240        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one. 
    241        """ 
    242        return sha256(self.output()).hexdigest() 
    243 
    244 
    245class CharsetMatches: 
    246    """ 
    247    Container with every CharsetMatch items ordered by default from most probable to the less one. 
    248    Act like a list(iterable) but does not implements all related methods. 
    249    """ 
    250 
    251    def __init__(self, results: list[CharsetMatch] | None = None): 
    252        self._results: list[CharsetMatch] = sorted(results) if results else [] 
    253 
    254    def __iter__(self) -> Iterator[CharsetMatch]: 
    255        yield from self._results 
    256 
    257    def __getitem__(self, item: int | str) -> CharsetMatch: 
    258        """ 
    259        Retrieve a single item either by its position or encoding name (alias may be used here). 
    260        Raise KeyError upon invalid index or encoding not present in results. 
    261        """ 
    262        if isinstance(item, int): 
    263            return self._results[item] 
    264        if isinstance(item, str): 
    265            item = iana_name(item, False) 
    266            for result in self._results: 
    267                if item in result.could_be_from_charset: 
    268                    return result 
    269        raise KeyError 
    270 
    271    def __len__(self) -> int: 
    272        return len(self._results) 
    273 
    274    def __bool__(self) -> bool: 
    275        return len(self._results) > 0 
    276 
    277    def append(self, item: CharsetMatch) -> None: 
    278        """ 
    279        Insert a single match. Will be inserted accordingly to preserve sort. 
    280        Can be inserted as a submatch. 
    281        """ 
    282        if not isinstance(item, CharsetMatch): 
    283            raise ValueError( 
    284                "Cannot append instance '{}' to CharsetMatches".format( 
    285                    str(item.__class__) 
    286                ) 
    287            ) 
    288        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage) 
    289        if len(item.raw) < TOO_BIG_SEQUENCE: 
    290            for match in self._results: 
    291                if match.fingerprint == item.fingerprint and match.chaos == item.chaos: 
    292                    match.add_submatch(item) 
    293                    return 
    294        self._results.append(item) 
    295        self._results = sorted(self._results) 
    296 
    297    def best(self) -> CharsetMatch | None: 
    298        """ 
    299        Simply return the first match. Strict equivalent to matches[0]. 
    300        """ 
    301        if not self._results: 
    302            return None 
    303        return self._results[0] 
    304 
    305    def first(self) -> CharsetMatch | None: 
    306        """ 
    307        Redundant method, call the method best(). Kept for BC reasons. 
    308        """ 
    309        return self.best() 
    310 
    311 
# A single language guess: (language name, ratio). CharsetMatch reads index 0 as the
# language and index 1 as the coherence ratio.
CoherenceMatch = Tuple[str, float]
# List of language guesses; CharsetMatch treats the first entry as the most probable one.
CoherenceMatches = List[CoherenceMatch]
    314 
    315 
    316class CliDetectionResult: 
    317    def __init__( 
    318        self, 
    319        path: str, 
    320        encoding: str | None, 
    321        encoding_aliases: list[str], 
    322        alternative_encodings: list[str], 
    323        language: str, 
    324        alphabets: list[str], 
    325        has_sig_or_bom: bool, 
    326        chaos: float, 
    327        coherence: float, 
    328        unicode_path: str | None, 
    329        is_preferred: bool, 
    330    ): 
    331        self.path: str = path 
    332        self.unicode_path: str | None = unicode_path 
    333        self.encoding: str | None = encoding 
    334        self.encoding_aliases: list[str] = encoding_aliases 
    335        self.alternative_encodings: list[str] = alternative_encodings 
    336        self.language: str = language 
    337        self.alphabets: list[str] = alphabets 
    338        self.has_sig_or_bom: bool = has_sig_or_bom 
    339        self.chaos: float = chaos 
    340        self.coherence: float = coherence 
    341        self.is_preferred: bool = is_preferred 
    342 
    343    @property 
    344    def __dict__(self) -> dict[str, Any]:  # type: ignore 
    345        return { 
    346            "path": self.path, 
    347            "encoding": self.encoding, 
    348            "encoding_aliases": self.encoding_aliases, 
    349            "alternative_encodings": self.alternative_encodings, 
    350            "language": self.language, 
    351            "alphabets": self.alphabets, 
    352            "has_sig_or_bom": self.has_sig_or_bom, 
    353            "chaos": self.chaos, 
    354            "coherence": self.coherence, 
    355            "unicode_path": self.unicode_path, 
    356            "is_preferred": self.is_preferred, 
    357        } 
    358 
    359    def to_json(self) -> str: 
    360        return dumps(self.__dict__, ensure_ascii=True, indent=4)