Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/detector.py: 34%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

70 statements  

1"""UniversalDetector — streaming encoding detection.""" 

2 

3from __future__ import annotations 

4 

5import warnings 

6from collections.abc import Iterable 

7from types import MappingProxyType 

8from typing import ClassVar 

9 

10from chardet import _utils 

11from chardet._utils import ( 

12 DEFAULT_MAX_BYTES, 

13 _resolve_prefer_superset, 

14 _validate_max_bytes, 

15) 

16from chardet.enums import EncodingEra, LanguageFilter 

17from chardet.equivalences import ( 

18 PREFERRED_SUPERSET, 

19 apply_compat_names, 

20 apply_preferred_superset, 

21) 

22from chardet.pipeline import _NONE_RESULT, DetectionDict, DetectionResult 

23from chardet.pipeline.orchestrator import run_pipeline 

24from chardet.registry import _validate_encoding, normalize_encodings 

25 

26 

class UniversalDetector:
    """Streaming character encoding detector.

    Offers the classic chardet 6.x feed/close interface for detecting the
    encoding of a byte stream incrementally: call :meth:`feed` with chunks,
    then :meth:`close` (or read :attr:`result`) for the verdict.

    Detection itself is delegated to the same pipeline that backs
    :func:`chardet.detect` and :func:`chardet.detect_all`, so every API
    entry point produces identical answers for identical input.

    .. note::

        Instances are **not** thread-safe; give each thread its own
        :class:`UniversalDetector`.
    """

    MINIMUM_THRESHOLD = _utils.MINIMUM_THRESHOLD
    # chardet 6.x callers sometimes read UniversalDetector.LEGACY_MAP
    # directly; keep it available as a read-only view.
    LEGACY_MAP: ClassVar[MappingProxyType[str, str]] = MappingProxyType(
        PREFERRED_SUPERSET
    )

    def __init__(  # noqa: PLR0913
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
        encoding_era: EncodingEra = EncodingEra.ALL,
        max_bytes: int = DEFAULT_MAX_BYTES,
        *,
        prefer_superset: bool = False,
        compat_names: bool = True,
        include_encodings: Iterable[str] | None = None,
        exclude_encodings: Iterable[str] | None = None,
        no_match_encoding: str = "cp1252",
        empty_input_encoding: str = "utf-8",
    ) -> None:
        """Set up the detector and validate its configuration.

        :param lang_filter: Deprecated -- accepted only for backward
            compatibility and ignored. Any value other than
            :attr:`LanguageFilter.ALL` triggers a ``DeprecationWarning``.
        :param should_rename_legacy: Deprecated alias for *prefer_superset*.
        :param encoding_era: Limit the candidate encodings to this era.
        :param max_bytes: Cap on how many bytes :meth:`feed` will buffer
            before detection is considered done.
        :param prefer_superset: When ``True``, map ISO subset encodings to
            their Windows/CP superset equivalents (e.g., ISO-8859-1 ->
            Windows-1252).
        :param compat_names: When ``True`` (default), report encoding names
            as chardet 5.x/6.x did; when ``False``, report raw Python codec
            names.
        :param include_encodings: If provided, only these encodings (names
            or aliases) are considered.
        :param exclude_encodings: If provided, these encodings are dropped
            from the candidate set.
        :param no_match_encoding: Fallback encoding when nothing survives
            the pipeline. Defaults to ``"cp1252"``.
        :param empty_input_encoding: Encoding reported for empty input.
            Defaults to ``"utf-8"``.
        """
        if lang_filter != LanguageFilter.ALL:
            warnings.warn(
                "lang_filter is not implemented in this version of chardet "
                "and will be ignored",
                DeprecationWarning,
                stacklevel=2,
            )
        # The deprecated should_rename_legacy flag folds into
        # prefer_superset; the helper arbitrates between the two.
        self._prefer_superset = _resolve_prefer_superset(
            should_rename_legacy, prefer_superset
        )
        self._compat_names = compat_names
        _validate_max_bytes(max_bytes)
        self._max_bytes = max_bytes
        self._encoding_era = encoding_era
        self._include_encodings = normalize_encodings(
            include_encodings, "include_encodings"
        )
        self._exclude_encodings = normalize_encodings(
            exclude_encodings, "exclude_encodings"
        )
        self._no_match_encoding = _validate_encoding(
            no_match_encoding, "no_match_encoding"
        )
        self._empty_input_encoding = _validate_encoding(
            empty_input_encoding, "empty_input_encoding"
        )
        self._buffer = bytearray()
        self._done = False
        self._closed = False
        self._result: DetectionResult | None = None

    def feed(self, byte_str: bytes | bytearray) -> None:
        """Accumulate a chunk of bytes for later detection.

        Bytes are appended to an internal buffer up to *max_bytes*; once
        the cap is hit, :attr:`done` becomes ``True`` and subsequent calls
        are no-ops until :meth:`reset`.

        :param byte_str: The next chunk of bytes to examine.
        :raises ValueError: If called after :meth:`close` without an
            intervening :meth:`reset`.
        """
        if self._closed:
            msg = "feed() called after close() without reset()"
            raise ValueError(msg)
        if self._done:
            return
        capacity = self._max_bytes - len(self._buffer)
        if capacity > 0:
            self._buffer += byte_str[:capacity]
        # Buffer full means no further input can change the outcome.
        self._done = len(self._buffer) >= self._max_bytes

    def close(self) -> DetectionDict:
        """Run the detection pipeline on the buffered data.

        Idempotent: a second call simply returns the already-computed
        result.

        :returns: A dictionary with keys ``"encoding"``, ``"confidence"``,
            and ``"language"``.
        """
        if self._closed:
            return self.result
        self._closed = True
        self._done = True
        candidates = run_pipeline(
            bytes(self._buffer),
            self._encoding_era,
            max_bytes=self._max_bytes,
            include_encodings=self._include_encodings,
            exclude_encodings=self._exclude_encodings,
            no_match_encoding=self._no_match_encoding,
            empty_input_encoding=self._empty_input_encoding,
        )
        # The pipeline returns candidates best-first; keep the winner.
        self._result = candidates[0]
        return self.result

    def reset(self) -> None:
        """Return the detector to a pristine state so it can be reused."""
        self._buffer = bytearray()
        self._done = False
        self._closed = False
        self._result = None

    @property
    def done(self) -> bool:
        """Whether detection is complete and no more data is needed."""
        return self._done

    @property
    def result(self) -> DetectionDict:
        """The current best detection result."""
        if self._result is None:
            return _NONE_RESULT.to_dict()
        # Build a fresh dict each access; the post-processors below
        # mutate it in place.
        verdict = self._result.to_dict()
        if self._prefer_superset:
            apply_preferred_superset(verdict)
        if self._compat_names:
            apply_compat_names(verdict)
        return verdict