1from __future__ import annotations 
    2 
    3from typing import TYPE_CHECKING, Any 
    4from warnings import warn 
    5 
    6from .api import from_bytes 
    7from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE 
    8 
# TODO: remove this check when dropping Python 3.7 support
# The import and the class below are only evaluated by static type checkers:
# at runtime TYPE_CHECKING is False, so typing_extensions is never imported
# and ResultDict does not exist as a runtime name.
if TYPE_CHECKING:
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        """Shape of the dictionary returned by the legacy ``detect()`` function."""

        # Name of the detected encoding, or None when no match was found.
        encoding: str | None
        # Detected language; an empty string when the language is unknown
        # or when no match was found.
        language: str
        # Detection confidence, or None when no match was found.
        confidence: float | None
    17 
    18 
    19def detect( 
    20    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any 
    21) -> ResultDict: 
    22    """ 
    23    chardet legacy method 
    24    Detect the encoding of the given byte string. It should be mostly backward-compatible. 
    25    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it) 
    26    This function is deprecated and should be used to migrate your project easily, consult the documentation for 
    27    further information. Not planned for removal. 
    28 
    29    :param byte_str:     The byte sequence to examine. 
    30    :param should_rename_legacy:  Should we rename legacy encodings 
    31                                  to their more modern equivalents? 
    32    """ 
    33    if len(kwargs): 
    34        warn( 
    35            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()" 
    36        ) 
    37 
    38    if not isinstance(byte_str, (bytearray, bytes)): 
    39        raise TypeError(  # pragma: nocover 
    40            f"Expected object of type bytes or bytearray, got: {type(byte_str)}" 
    41        ) 
    42 
    43    if isinstance(byte_str, bytearray): 
    44        byte_str = bytes(byte_str) 
    45 
    46    r = from_bytes(byte_str).best() 
    47 
    48    encoding = r.encoding if r is not None else None 
    49    language = r.language if r is not None and r.language != "Unknown" else "" 
    50    confidence = 1.0 - r.chaos if r is not None else None 
    51 
    52    # automatically lower confidence 
    53    # on small bytes samples. 
    54    # https://github.com/jawah/charset_normalizer/issues/391 
    55    if ( 
    56        confidence is not None 
    57        and confidence >= 0.9 
    58        and encoding 
    59        not in { 
    60            "utf_8", 
    61            "ascii", 
    62        } 
    63        and r.bom is False  # type: ignore[union-attr] 
    64        and len(byte_str) < TOO_SMALL_SEQUENCE 
    65    ): 
    66        confidence -= 0.2 
    67 
    68    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process 
    69    # but chardet does return 'utf-8-sig' and it is a valid codec name. 
    70    if r is not None and encoding == "utf_8" and r.bom: 
    71        encoding += "_sig" 
    72 
    73    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE: 
    74        encoding = CHARDET_CORRESPONDENCE[encoding] 
    75 
    76    return { 
    77        "encoding": encoding, 
    78        "language": language, 
    79        "confidence": confidence, 
    80    }