Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/detector.py: 34%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""UniversalDetector — streaming encoding detection."""
3from __future__ import annotations
5import warnings
6from collections.abc import Iterable
7from types import MappingProxyType
8from typing import ClassVar
10from chardet import _utils
11from chardet._utils import (
12 DEFAULT_MAX_BYTES,
13 _resolve_prefer_superset,
14 _validate_max_bytes,
15)
16from chardet.enums import EncodingEra, LanguageFilter
17from chardet.equivalences import (
18 PREFERRED_SUPERSET,
19 apply_compat_names,
20 apply_preferred_superset,
21)
22from chardet.pipeline import _NONE_RESULT, DetectionDict, DetectionResult
23from chardet.pipeline.orchestrator import run_pipeline
24from chardet.registry import _validate_encoding, normalize_encodings
class UniversalDetector:
    """Incremental (streaming) character-encoding detector.

    Bytes are accumulated via :meth:`feed`; :meth:`close` runs the shared
    detection pipeline over the buffered data and reports the best match.
    The feed/close/reset protocol is compatible with the chardet 6.x API.

    Because the same pipeline backs :func:`chardet.detect` and
    :func:`chardet.detect_all`, results agree no matter which entry point
    callers use.

    .. note::

        Instances are **not** thread-safe; give each thread its own
        :class:`UniversalDetector`.
    """

    MINIMUM_THRESHOLD = _utils.MINIMUM_THRESHOLD
    # Kept public for chardet 6.x callers that read
    # UniversalDetector.LEGACY_MAP directly.
    LEGACY_MAP: ClassVar[MappingProxyType[str, str]] = MappingProxyType(
        PREFERRED_SUPERSET
    )

    def __init__(  # noqa: PLR0913
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
        encoding_era: EncodingEra = EncodingEra.ALL,
        max_bytes: int = DEFAULT_MAX_BYTES,
        *,
        prefer_superset: bool = False,
        compat_names: bool = True,
        include_encodings: Iterable[str] | None = None,
        exclude_encodings: Iterable[str] | None = None,
        no_match_encoding: str = "cp1252",
        empty_input_encoding: str = "utf-8",
    ) -> None:
        """Set up a fresh detector.

        :param lang_filter: Deprecated and ignored; passing anything other
            than :attr:`LanguageFilter.ALL` emits a ``DeprecationWarning``.
        :param should_rename_legacy: Deprecated alias for *prefer_superset*.
        :param encoding_era: Limit the candidate encodings to a given era.
        :param max_bytes: Cap on how many bytes :meth:`feed` will buffer
            before detection is considered done.
        :param prefer_superset: When ``True``, map ISO subset encodings to
            their Windows/CP superset (e.g. ISO-8859-1 -> Windows-1252).
        :param compat_names: When ``True`` (default), report chardet
            5.x/6.x-compatible encoding names; otherwise raw codec names.
        :param include_encodings: Optional whitelist of encoding names or
            aliases to consider.
        :param exclude_encodings: Optional blacklist of encodings to drop
            from the candidate set.
        :param no_match_encoding: Fallback encoding when nothing survives
            the pipeline. Defaults to ``"cp1252"``.
        :param empty_input_encoding: Encoding reported for empty input.
            Defaults to ``"utf-8"``.
        """
        if lang_filter != LanguageFilter.ALL:
            warnings.warn(
                "lang_filter is not implemented in this version of chardet "
                "and will be ignored",
                DeprecationWarning,
                stacklevel=2,
            )
        # Fold the deprecated flag into the modern keyword before storing.
        self._prefer_superset = _resolve_prefer_superset(
            should_rename_legacy, prefer_superset
        )
        self._compat_names = compat_names
        _validate_max_bytes(max_bytes)
        self._encoding_era = encoding_era
        self._max_bytes = max_bytes
        self._include_encodings = normalize_encodings(
            include_encodings, "include_encodings"
        )
        self._exclude_encodings = normalize_encodings(
            exclude_encodings, "exclude_encodings"
        )
        self._no_match_encoding = _validate_encoding(
            no_match_encoding, "no_match_encoding"
        )
        self._empty_input_encoding = _validate_encoding(
            empty_input_encoding, "empty_input_encoding"
        )
        self._buffer = bytearray()
        self._done = False
        self._closed = False
        self._result: DetectionResult | None = None

    def feed(self, byte_str: bytes | bytearray) -> None:
        """Append a chunk of bytes to the internal buffer.

        Once *max_bytes* bytes have accumulated, :attr:`done` becomes
        ``True`` and subsequent chunks are silently dropped until
        :meth:`reset`.

        :param byte_str: The next chunk of bytes to examine.
        :raises ValueError: If the detector was closed and not reset.
        """
        if self._closed:
            raise ValueError("feed() called after close() without reset()")
        if self._done:
            return
        capacity = self._max_bytes - len(self._buffer)
        if capacity > 0:
            # Only take as much as still fits under the byte cap.
            self._buffer.extend(byte_str[:capacity])
        if len(self._buffer) >= self._max_bytes:
            self._done = True

    def close(self) -> DetectionDict:
        """Run detection on the buffered bytes and return the top result.

        Safe to call more than once; subsequent calls just return the
        already-computed result.

        :returns: A dict with ``"encoding"``, ``"confidence"``, and
            ``"language"`` keys.
        """
        if self._closed:
            return self.result
        self._closed = True
        pipeline_results = run_pipeline(
            bytes(self._buffer),
            self._encoding_era,
            max_bytes=self._max_bytes,
            include_encodings=self._include_encodings,
            exclude_encodings=self._exclude_encodings,
            no_match_encoding=self._no_match_encoding,
            empty_input_encoding=self._empty_input_encoding,
        )
        self._result = pipeline_results[0]
        self._done = True
        return self.result

    def reset(self) -> None:
        """Return the detector to a pristine state so it can be reused."""
        self._buffer = bytearray()
        self._result = None
        self._done = False
        self._closed = False

    @property
    def done(self) -> bool:
        """``True`` once no further input is needed or wanted."""
        return self._done

    @property
    def result(self) -> DetectionDict:
        """The best detection result seen so far, as a plain dict."""
        if self._result is None:
            return _NONE_RESULT.to_dict()
        as_dict = self._result.to_dict()
        if self._prefer_superset:
            apply_preferred_superset(as_dict)
        if self._compat_names:
            apply_compat_names(as_dict)
        return as_dict