Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/detector.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

65 statements  

1"""UniversalDetector — streaming encoding detection.""" 

2 

3from __future__ import annotations 

4 

5import warnings 

6from types import MappingProxyType 

7from typing import ClassVar 

8 

9from chardet import _utils 

10from chardet._utils import ( 

11 DEFAULT_MAX_BYTES, 

12 _resolve_prefer_superset, 

13 _validate_max_bytes, 

14) 

15from chardet.enums import EncodingEra, LanguageFilter 

16from chardet.equivalences import ( 

17 PREFERRED_SUPERSET, 

18 apply_compat_names, 

19 apply_preferred_superset, 

20) 

21from chardet.pipeline import DetectionDict, DetectionResult 

22from chardet.pipeline.orchestrator import run_pipeline 

23 

# Sentinel result returned by ``UniversalDetector.result`` before any
# detection has completed (nothing fed / close() not yet called).
_NONE_RESULT = DetectionResult(encoding=None, confidence=0.0, language=None)

25 

26 

class UniversalDetector:
    """Incremental character-encoding detector with a feed/close API.

    Bytes are accumulated through repeated :meth:`feed` calls and analysed
    when :meth:`close` is invoked, matching the chardet 6.x streaming API.

    Detection is delegated to the same pipeline that backs
    :func:`chardet.detect` and :func:`chardet.detect_all`, so every entry
    point yields identical results for identical input.

    .. note::

        Instances are **not** thread-safe; give each thread its own
        :class:`UniversalDetector`.
    """

    MINIMUM_THRESHOLD = _utils.MINIMUM_THRESHOLD
    # Read-only view kept so chardet 6.x callers that touch
    # UniversalDetector.LEGACY_MAP directly keep working.
    LEGACY_MAP: ClassVar[MappingProxyType[str, str]] = MappingProxyType(
        PREFERRED_SUPERSET
    )

    def __init__(  # noqa: PLR0913
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
        encoding_era: EncodingEra = EncodingEra.ALL,
        max_bytes: int = DEFAULT_MAX_BYTES,
        *,
        prefer_superset: bool = False,
        compat_names: bool = True,
    ) -> None:
        """Create a fresh detector.

        :param lang_filter: Deprecated -- accepted for backward compatibility
            but has no effect. Passing anything other than
            :attr:`LanguageFilter.ALL` emits a :class:`DeprecationWarning`.
        :param should_rename_legacy: Deprecated alias for *prefer_superset*.
        :param encoding_era: Restrict candidate encodings to the given era.
        :param max_bytes: Upper bound on how many bytes :meth:`feed` will
            buffer before detection is considered complete.
        :param prefer_superset: If ``True``, remap ISO subset encodings to
            their Windows/CP superset equivalents (e.g., ISO-8859-1 ->
            Windows-1252).
        :param compat_names: If ``True`` (default), report encoding names
            compatible with chardet 5.x/6.x; if ``False``, report raw
            Python codec names.
        """
        if lang_filter != LanguageFilter.ALL:
            warnings.warn(
                "lang_filter is not implemented in this version of chardet "
                "and will be ignored",
                DeprecationWarning,
                stacklevel=2,
            )
        # Resolve the deprecated alias first, then validate, preserving the
        # order in which warnings/errors surface.
        self._prefer_superset = _resolve_prefer_superset(
            should_rename_legacy, prefer_superset
        )
        self._compat_names = compat_names
        _validate_max_bytes(max_bytes)
        self._encoding_era = encoding_era
        self._max_bytes = max_bytes
        # Annotate here; reset() establishes the actual initial state.
        self._result: DetectionResult | None = None
        self.reset()

    def feed(self, byte_str: bytes | bytearray) -> None:
        """Accumulate the next chunk of input bytes.

        Once *max_bytes* bytes are buffered, :attr:`done` becomes ``True``
        and subsequent chunks are silently discarded until :meth:`reset`.

        :param byte_str: The next chunk of bytes to examine.
        :raises ValueError: If called after :meth:`close` without an
            intervening :meth:`reset`.
        """
        if self._closed:
            msg = "feed() called after close() without reset()"
            raise ValueError(msg)
        if self._done:
            return
        capacity = self._max_bytes - len(self._buffer)
        if capacity > 0:
            # Only take as much as still fits under the max_bytes cap.
            self._buffer += byte_str[:capacity]
        self._done = len(self._buffer) >= self._max_bytes

    def close(self) -> DetectionDict:
        """Run detection on the buffered bytes and return the best result.

        Idempotent: the pipeline runs only on the first call; later calls
        return the cached result.

        :returns: A dictionary with keys ``"encoding"``, ``"confidence"``,
            and ``"language"``.
        """
        if not self._closed:
            self._closed = True
            self._done = True
            payload = bytes(self._buffer)
            candidates = run_pipeline(
                payload, self._encoding_era, max_bytes=self._max_bytes
            )
            # run_pipeline orders candidates best-first; keep the winner.
            self._result = candidates[0]
        return self.result

    def reset(self) -> None:
        """Return the detector to its pristine state for reuse."""
        self._buffer = bytearray()
        self._result = None
        self._done = self._closed = False

    @property
    def done(self) -> bool:
        """``True`` once detection needs no further input."""
        return self._done

    @property
    def result(self) -> DetectionDict:
        """The current best detection result as a chardet-style dict."""
        if self._result is None:
            return _NONE_RESULT.to_dict()
        out = self._result.to_dict()
        if self._prefer_superset:
            apply_preferred_superset(out)
        if self._compat_names:
            apply_compat_names(out)
        return out