Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/detector.py: 35%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""UniversalDetector — streaming encoding detection."""
3from __future__ import annotations
5import warnings
6from types import MappingProxyType
7from typing import ClassVar
9from chardet import _utils
10from chardet._utils import (
11 DEFAULT_MAX_BYTES,
12 _resolve_prefer_superset,
13 _validate_max_bytes,
14)
15from chardet.enums import EncodingEra, LanguageFilter
16from chardet.equivalences import (
17 PREFERRED_SUPERSET,
18 apply_compat_names,
19 apply_preferred_superset,
20)
21from chardet.pipeline import DetectionDict, DetectionResult
22from chardet.pipeline.orchestrator import run_pipeline
# Sentinel result returned by ``UniversalDetector.result`` before any
# detection has been performed (or after ``reset()``).
_NONE_RESULT = DetectionResult(encoding=None, confidence=0.0, language=None)
class UniversalDetector:
    """Incremental (feed/close) character encoding detector.

    Bytes are accumulated through :meth:`feed` until :meth:`close` is
    called or *max_bytes* have been buffered; the final answer comes from
    the same detection pipeline that backs :func:`chardet.detect` and
    :func:`chardet.detect_all`, so every API yields consistent results.
    Compatible with the chardet 6.x feed/close API.

    .. note::

        Instances are **not** thread-safe. Give each thread its own
        :class:`UniversalDetector`.
    """

    MINIMUM_THRESHOLD = _utils.MINIMUM_THRESHOLD
    # Kept public for chardet 6.x callers that read
    # UniversalDetector.LEGACY_MAP directly.
    LEGACY_MAP: ClassVar[MappingProxyType[str, str]] = MappingProxyType(
        PREFERRED_SUPERSET
    )

    def __init__(  # noqa: PLR0913
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        should_rename_legacy: bool = False,
        encoding_era: EncodingEra = EncodingEra.ALL,
        max_bytes: int = DEFAULT_MAX_BYTES,
        *,
        prefer_superset: bool = False,
        compat_names: bool = True,
    ) -> None:
        """Set up a fresh detector.

        :param lang_filter: Deprecated -- accepted for backward
            compatibility but ignored; a warning is emitted for any value
            other than :attr:`LanguageFilter.ALL`.
        :param should_rename_legacy: Deprecated alias for
            *prefer_superset*.
        :param encoding_era: Limit candidate encodings to this era.
        :param max_bytes: Cap on how many bytes :meth:`feed` will buffer.
        :param prefer_superset: When ``True``, map ISO subset encodings
            to their Windows/CP superset equivalents (e.g., ISO-8859-1 ->
            Windows-1252).
        :param compat_names: When ``True`` (default), report encoding
            names compatible with chardet 5.x/6.x; otherwise report raw
            Python codec names.
        """
        if lang_filter != LanguageFilter.ALL:
            warnings.warn(
                "lang_filter is not implemented in this version of chardet "
                "and will be ignored",
                DeprecationWarning,
                stacklevel=2,
            )
        # Fold the deprecated flag into the keyword-only option first, so
        # any alias deprecation warning fires before validation can raise.
        self._prefer_superset = _resolve_prefer_superset(
            should_rename_legacy, prefer_superset
        )
        self._compat_names = compat_names
        _validate_max_bytes(max_bytes)
        self._encoding_era = encoding_era
        self._max_bytes = max_bytes
        self._buffer = bytearray()
        self._done = False
        self._closed = False
        self._result: DetectionResult | None = None

    def feed(self, byte_str: bytes | bytearray) -> None:
        """Buffer the next chunk of the input stream.

        Data accumulates internally; once *max_bytes* are held,
        :attr:`done` turns ``True`` and later chunks are dropped until
        :meth:`reset` is called.

        :param byte_str: Next chunk of bytes to examine.
        :raises ValueError: When called after :meth:`close` without an
            intervening :meth:`reset`.
        """
        if self._closed:
            raise ValueError("feed() called after close() without reset()")
        if self._done:
            return
        capacity = self._max_bytes - len(self._buffer)
        if capacity > 0:
            # Take only as much as fits under the max_bytes cap.
            self._buffer += byte_str[:capacity]
        if len(self._buffer) >= self._max_bytes:
            self._done = True

    def close(self) -> DetectionDict:
        """Finalize detection and return the best result.

        Runs the full pipeline over everything buffered so far; repeated
        calls return the cached result.

        :returns: A dictionary with keys ``"encoding"``,
            ``"confidence"``, and ``"language"``.
        """
        if self._closed:
            return self.result
        self._closed = True
        candidates = run_pipeline(
            bytes(self._buffer), self._encoding_era, max_bytes=self._max_bytes
        )
        self._result = candidates[0]
        self._done = True
        return self.result

    def reset(self) -> None:
        """Restore the initial state so the detector can be reused."""
        self._buffer = bytearray()
        self._result = None
        self._closed = False
        self._done = False

    @property
    def done(self) -> bool:
        """Whether detection is complete and no more data is needed."""
        return self._done

    @property
    def result(self) -> DetectionDict:
        """The current best detection result as a plain dict."""
        if self._result is None:
            return _NONE_RESULT.to_dict()
        payload = self._result.to_dict()
        if self._prefer_superset:
            apply_preferred_superset(payload)
        if self._compat_names:
            apply_compat_names(payload)
        return payload