Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py: 56%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

39 statements  

1"""Universal character encoding detector — MIT-licensed rewrite.""" 

2 

3from __future__ import annotations 

4 

5from chardet._utils import ( 

6 _DEFAULT_CHUNK_SIZE, 

7 DEFAULT_MAX_BYTES, 

8 MINIMUM_THRESHOLD, 

9 _resolve_prefer_superset, 

10 _validate_max_bytes, 

11 _warn_deprecated_chunk_size, 

12) 

13from chardet._version import __version__ 

14from chardet.detector import UniversalDetector 

15from chardet.enums import EncodingEra, LanguageFilter 

16from chardet.equivalences import apply_compat_names, apply_preferred_superset 

17from chardet.pipeline import DetectionDict, DetectionResult 

18from chardet.pipeline.orchestrator import run_pipeline 

19 

# Public API of the package, as exported by ``from chardet import *``.
__all__ = [
    # Module-level constants.
    "DEFAULT_MAX_BYTES",
    "MINIMUM_THRESHOLD",
    # Result types, enums and the detector class.
    "DetectionDict",
    "DetectionResult",
    "EncodingEra",
    "LanguageFilter",
    "UniversalDetector",
    # Version string and the functional entry points.
    "__version__",
    "detect",
    "detect_all",
]

32 

33 

def detect(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
) -> DetectionDict:
    """Return the single best encoding guess for *byte_str*.

    The input is run through the detection pipeline and the first result
    the pipeline yields is converted to a dictionary, optionally
    post-processed, and returned.

    :param byte_str: The byte sequence to analyse. A ``bytearray`` (or any
        non-``bytes`` input) is copied into an immutable ``bytes`` object
        before detection.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents (e.g., ISO-8859-1 -> Windows-1252).
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :returns: A dictionary with keys ``"encoding"``, ``"confidence"``, and
        ``"language"``.
    """
    # Argument handling: deprecation warning first, then validation, then
    # reconcile the legacy flag with its keyword-only replacement.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    use_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline is always handed a real ``bytes`` object.
    if isinstance(byte_str, bytes):
        payload = byte_str
    else:
        payload = bytes(byte_str)

    candidates = run_pipeline(payload, encoding_era, max_bytes=max_bytes)
    best = candidates[0].to_dict()

    # Post-processing is applied to the dictionary itself; the superset
    # remapping runs before the compatibility renaming.
    if use_superset:
        apply_preferred_superset(best)
    if compat_names:
        apply_compat_names(best)
    return best

71 

72 

def detect_all(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    ignore_threshold: bool = False,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
) -> list[DetectionDict]:
    """Return every candidate encoding for *byte_str*, best first.

    Unless *ignore_threshold* is set, candidates whose confidence is
    <= MINIMUM_THRESHOLD (0.20) are dropped. Should that leave nothing,
    the complete unfiltered list is used instead, so the caller always
    gets back whatever the pipeline produced.

    :param byte_str: The byte sequence to analyse. A ``bytearray`` (or any
        non-``bytes`` input) is copied into an immutable ``bytes`` object
        before detection.
    :param ignore_threshold: If ``True``, return all candidate encodings
        regardless of confidence score.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents.
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :returns: A list of dictionaries, sorted by descending confidence.
    """
    # Argument handling: deprecation warning first, then validation, then
    # reconcile the legacy flag with its keyword-only replacement.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    use_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline is always handed a real ``bytes`` object.
    if isinstance(byte_str, bytes):
        payload = byte_str
    else:
        payload = bytes(byte_str)

    candidates = run_pipeline(payload, encoding_era, max_bytes=max_bytes)
    entries = [candidate.to_dict() for candidate in candidates]

    if not ignore_threshold:
        confident = [
            entry for entry in entries if entry["confidence"] > MINIMUM_THRESHOLD
        ]
        # An empty filter result falls back to the unfiltered list.
        entries = confident or entries

    # Collect the enabled post-processing steps once; each entry then gets
    # the superset remapping (if enabled) before the compatibility renaming.
    steps = []
    if use_superset:
        steps.append(apply_preferred_superset)
    if compat_names:
        steps.append(apply_compat_names)
    for entry in entries:
        for step in steps:
            step(entry)

    return sorted(entries, key=lambda entry: entry["confidence"], reverse=True)