Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py: 57%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

49 statements  

1"""Universal character encoding detector — 0BSD-licensed rewrite.""" 

2 

3from __future__ import annotations 

4 

5from collections.abc import Iterable 

6 

7from chardet._utils import ( 

8 _DEFAULT_CHUNK_SIZE, 

9 DEFAULT_MAX_BYTES, 

10 MINIMUM_THRESHOLD, 

11 _resolve_prefer_superset, 

12 _validate_max_bytes, 

13 _warn_deprecated_chunk_size, 

14) 

15from chardet._version import __version__ 

16from chardet.detector import UniversalDetector 

17from chardet.enums import EncodingEra, LanguageFilter 

18from chardet.equivalences import apply_compat_names, apply_preferred_superset 

19from chardet.pipeline import DetectionDict, DetectionResult 

20from chardet.pipeline.orchestrator import run_pipeline 

21from chardet.registry import _validate_encoding, normalize_encodings 

22 

# Names exported by ``from chardet import *`` — the package's public API.
# The entries (and their order) are part of the public contract; keep stable.
__all__ = [
    "DEFAULT_MAX_BYTES", "MINIMUM_THRESHOLD",
    "DetectionDict", "DetectionResult",
    "EncodingEra", "LanguageFilter",
    "UniversalDetector",
    "__version__",
    "detect", "detect_all",
]

35 

36 

def detect(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
    include_encodings: Iterable[str] | None = None,
    exclude_encodings: Iterable[str] | None = None,
    no_match_encoding: str = "cp1252",
    empty_input_encoding: str = "utf-8",
) -> DetectionDict:
    """Return the single most likely encoding for *byte_str*.

    :param byte_str: The byte sequence to detect encoding for.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents (e.g., ISO-8859-1 -> Windows-1252).
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :param include_encodings: If given, restrict detection to only these
        encodings (names or aliases).
    :param exclude_encodings: If given, remove these encodings from the
        candidate set.
    :param no_match_encoding: Encoding to return when no candidate survives
        the pipeline. Defaults to ``"cp1252"``.
    :param empty_input_encoding: Encoding to return for empty input. Defaults
        to ``"utf-8"``.
    :returns: A dictionary with keys ``"encoding"``, ``"confidence"``, and
        ``"language"``.
    """
    # Deprecation warnings / argument validation happen up front so callers
    # get errors before any detection work starts.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    prefer_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline operates on immutable bytes; bytearray input is copied.
    payload = byte_str if isinstance(byte_str, bytes) else bytes(byte_str)
    candidates = run_pipeline(
        payload,
        encoding_era,
        max_bytes=max_bytes,
        include_encodings=normalize_encodings(include_encodings, "include_encodings"),
        exclude_encodings=normalize_encodings(exclude_encodings, "exclude_encodings"),
        no_match_encoding=_validate_encoding(no_match_encoding, "no_match_encoding"),
        empty_input_encoding=_validate_encoding(
            empty_input_encoding, "empty_input_encoding"
        ),
    )

    # The pipeline's first result is the best candidate; optionally remap
    # its name for superset preference and legacy-compatible spelling.
    best = candidates[0].to_dict()
    if prefer_superset:
        apply_preferred_superset(best)
    if compat_names:
        apply_compat_names(best)
    return best

98 

99 

def detect_all(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    ignore_threshold: bool = False,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
    include_encodings: Iterable[str] | None = None,
    exclude_encodings: Iterable[str] | None = None,
    no_match_encoding: str = "cp1252",
    empty_input_encoding: str = "utf-8",
) -> list[DetectionDict]:
    """Return every plausible encoding for *byte_str*, best first.

    When *ignore_threshold* is False (the default), results with confidence
    <= MINIMUM_THRESHOLD (0.20) are filtered out. If all results are below
    the threshold, the full unfiltered list is returned as a fallback so the
    caller always receives at least one result.

    :param byte_str: The byte sequence to detect encoding for.
    :param ignore_threshold: If ``True``, return all candidate encodings
        regardless of confidence score.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents.
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :param include_encodings: If given, restrict detection to only these
        encodings (names or aliases).
    :param exclude_encodings: If given, remove these encodings from the
        candidate set.
    :param no_match_encoding: Encoding to return when no candidate survives
        the pipeline. Defaults to ``"cp1252"``.
    :param empty_input_encoding: Encoding to return for empty input. Defaults
        to ``"utf-8"``.
    :returns: A list of dictionaries, sorted by descending confidence.
    """
    # Validate and normalize arguments before running any detection.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    prefer_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline operates on immutable bytes; bytearray input is copied.
    payload = byte_str if isinstance(byte_str, bytes) else bytes(byte_str)
    candidates = run_pipeline(
        payload,
        encoding_era,
        max_bytes=max_bytes,
        include_encodings=normalize_encodings(include_encodings, "include_encodings"),
        exclude_encodings=normalize_encodings(exclude_encodings, "exclude_encodings"),
        no_match_encoding=_validate_encoding(no_match_encoding, "no_match_encoding"),
        empty_input_encoding=_validate_encoding(
            empty_input_encoding, "empty_input_encoding"
        ),
    )
    entries = [candidate.to_dict() for candidate in candidates]

    # Drop low-confidence results unless asked not to; if *everything* is
    # low-confidence, fall back to the unfiltered list so the caller always
    # gets at least one result.
    if not ignore_threshold:
        confident = [e for e in entries if e["confidence"] > MINIMUM_THRESHOLD]
        entries = confident or entries

    # Apply name remapping to each surviving result in place.
    for entry in entries:
        if prefer_superset:
            apply_preferred_superset(entry)
        if compat_names:
            apply_compat_names(entry)

    entries.sort(key=lambda entry: entry["confidence"], reverse=True)
    return entries