1from __future__ import annotations
2
3from typing import TYPE_CHECKING, Any
4from warnings import warn
5
6from .api import from_bytes
7from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
8
# Everything under this guard exists for static type checkers only; it is not
# executed at runtime, so ``ResultDict`` must only be referenced from
# annotations (which are lazy thanks to ``from __future__ import annotations``).
if TYPE_CHECKING:
    from typing import TypedDict

    class ResultDict(TypedDict):
        """Shape of the dictionary returned by :func:`detect`."""

        # Detected encoding name, or None when no match was found.
        encoding: str | None
        # Detected language; detect() uses "" when there is no match or the
        # language is reported as "Unknown".
        language: str
        # Confidence computed by detect() as ``1.0 - chaos`` (possibly lowered
        # for small samples), or None when no match was found.
        confidence: float | None
16
17
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents? When left to ``False``,
                                 any detected name present in ``CHARDET_CORRESPONDENCE`` is
                                 translated through that table; when ``True`` the detected
                                 name is returned untouched.
    :param kwargs: Accepted only for signature compatibility; every extra keyword
                   is ignored and a warning naming them is emitted.
    :return: A dict with the keys ``encoding``, ``language`` and ``confidence``.
             When no match is found, ``encoding`` and ``confidence`` are ``None``
             and ``language`` is an empty string.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()",
            # Attribute the warning to the caller that passed the stray
            # arguments rather than to this module.
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    # No match: nothing below applies (no confidence to adjust, no name to
    # rewrite), so answer right away with the "unknown" result.
    if r is None:
        return {
            "encoding": None,
            "language": "",
            "confidence": None,
        }

    encoding = r.encoding
    language = r.language if r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos

    # automatically lower confidence
    # on small bytes samples.
    # https://github.com/jawah/charset_normalizer/issues/391
    if (
        confidence >= 0.9
        and encoding not in {"utf_8", "ascii"}
        and r.bom is False
        and len(byte_str) < TOO_SMALL_SEQUENCE
    ):
        confidence -= 0.2

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if encoding == "utf_8" and r.bom:
        encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }