Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/legacy.py: 25%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

28 statements  

1from __future__ import annotations 

2 

3from typing import TYPE_CHECKING, Any 

4from warnings import warn 

5 

6from .api import from_bytes 

7from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE 

8 

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
    # Only evaluated by static type checkers: typing_extensions is therefore
    # never imported at runtime, and ResultDict exists solely as an annotation
    # target (annotations are lazy thanks to `from __future__ import annotations`).
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        """Shape of the mapping returned by the legacy ``detect()`` function."""

        # Detected encoding name, or None when no match was found.
        encoding: str | None
        # Detected language; empty string when it is unknown or no match was found.
        language: str
        # Confidence score, or None when no match was found.
        confidence: float | None

17 

18 

def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (bytes or bytearray).
    :param should_rename_legacy: When exactly ``False`` (the default), an encoding
            name present in CHARDET_CORRESPONDENCE is replaced by the value mapped
            there; otherwise the detected name is returned as is.
    :param kwargs: Accepted for signature compatibility only; any extra argument
            is ignored and reported through a warning.
    :return: A dict with the keys ``encoding``, ``language`` and ``confidence``.
            ``encoding`` and ``confidence`` are ``None`` and ``language`` is ``""``
            when no match is found.
    :raises TypeError: If ``byte_str`` is neither bytes nor bytearray.
    """
    if kwargs:
        # stacklevel=2 attributes the warning to the caller's line, which is
        # where the unsupported arguments actually need to be removed.
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()",
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_match = from_bytes(byte_str).best()

    # No candidate at all: nothing below applies, so answer right away.
    if best_match is None:
        return {
            "encoding": None,
            "language": "",
            "confidence": None,
        }

    encoding = best_match.encoding
    language = "" if best_match.language == "Unknown" else best_match.language
    confidence = 1.0 - best_match.chaos

    # automatically lower confidence
    # on small bytes samples.
    # https://github.com/jawah/charset_normalizer/issues/391
    if (
        confidence >= 0.9
        and encoding not in {"utf_8", "ascii"}
        and best_match.bom is False
        and len(byte_str) < TOO_SMALL_SEQUENCE
    ):
        confidence -= 0.2

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if encoding == "utf_8" and best_match.bom:
        encoding += "_sig"

    # Identity check kept on purpose: only a literal False triggers the renaming.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }