Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py: 57%
1"""Universal character encoding detector — 0BSD-licensed rewrite."""
3from __future__ import annotations
5from collections.abc import Iterable
7from chardet._utils import (
8 _DEFAULT_CHUNK_SIZE,
9 DEFAULT_MAX_BYTES,
10 MINIMUM_THRESHOLD,
11 _resolve_prefer_superset,
12 _validate_max_bytes,
13 _warn_deprecated_chunk_size,
14)
15from chardet._version import __version__
16from chardet.detector import UniversalDetector
17from chardet.enums import EncodingEra, LanguageFilter
18from chardet.equivalences import apply_compat_names, apply_preferred_superset
19from chardet.pipeline import DetectionDict, DetectionResult
20from chardet.pipeline.orchestrator import run_pipeline
21from chardet.registry import _validate_encoding, normalize_encodings
23__all__ = [
24 "DEFAULT_MAX_BYTES",
25 "MINIMUM_THRESHOLD",
26 "DetectionDict",
27 "DetectionResult",
28 "EncodingEra",
29 "LanguageFilter",
30 "UniversalDetector",
31 "__version__",
32 "detect",
33 "detect_all",
34]


def detect(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
    include_encodings: Iterable[str] | None = None,
    exclude_encodings: Iterable[str] | None = None,
    no_match_encoding: str = "cp1252",
    empty_input_encoding: str = "utf-8",
) -> DetectionDict:
    """Detect the encoding of the given byte string.

    :param byte_str: The byte sequence to detect encoding for.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents (e.g., ISO-8859-1 -> Windows-1252).
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :param include_encodings: If given, restrict detection to only these
        encodings (names or aliases).
    :param exclude_encodings: If given, remove these encodings from the
        candidate set.
    :param no_match_encoding: Encoding to return when no candidate survives
        the pipeline. Defaults to ``"cp1252"``.
    :param empty_input_encoding: Encoding to return for empty input. Defaults
        to ``"utf-8"``.
    :returns: A dictionary with keys ``"encoding"``, ``"confidence"``, and
        ``"language"``.
74 """
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    prefer_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)
    include = normalize_encodings(include_encodings, "include_encodings")
    exclude = normalize_encodings(exclude_encodings, "exclude_encodings")
    no_match = _validate_encoding(no_match_encoding, "no_match_encoding")
    empty = _validate_encoding(empty_input_encoding, "empty_input_encoding")
    data = byte_str if isinstance(byte_str, bytes) else bytes(byte_str)
    results = run_pipeline(
        data,
        encoding_era,
        max_bytes=max_bytes,
        include_encodings=include,
        exclude_encodings=exclude,
        no_match_encoding=no_match,
        empty_input_encoding=empty,
    )
    result = results[0].to_dict()
    if prefer_superset:
        apply_preferred_superset(result)
    if compat_names:
        apply_compat_names(result)
    return result


def detect_all(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    ignore_threshold: bool = False,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
    include_encodings: Iterable[str] | None = None,
    exclude_encodings: Iterable[str] | None = None,
    no_match_encoding: str = "cp1252",
    empty_input_encoding: str = "utf-8",
) -> list[DetectionDict]:
    """Detect all possible encodings of the given byte string.

    When *ignore_threshold* is False (the default), results with confidence
    <= MINIMUM_THRESHOLD (0.20) are filtered out. If no result exceeds the
    threshold, the full unfiltered list is returned as a fallback so the
    caller always receives at least one result.

    :param byte_str: The byte sequence to detect encoding for.
    :param ignore_threshold: If ``True``, return all candidate encodings
        regardless of confidence score.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents.
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :param include_encodings: If given, restrict detection to only these
        encodings (names or aliases).
    :param exclude_encodings: If given, remove these encodings from the
        candidate set.
    :param no_match_encoding: Encoding to return when no candidate survives
        the pipeline. Defaults to ``"cp1252"``.
    :param empty_input_encoding: Encoding to return for empty input. Defaults
        to ``"utf-8"``.
    :returns: A list of dictionaries, sorted by descending confidence.
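
    Example (illustrative sketch; the candidate list and confidence values
    depend on the detection pipeline, so only the documented descending
    ordering is exercised here)::

        >>> results = detect_all("Grüße!".encode("latin-1"), ignore_threshold=True)
        >>> confidences = [r["confidence"] for r in results]
        >>> confidences == sorted(confidences, reverse=True)
        True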
144 """
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    prefer_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)
    include = normalize_encodings(include_encodings, "include_encodings")
    exclude = normalize_encodings(exclude_encodings, "exclude_encodings")
    no_match = _validate_encoding(no_match_encoding, "no_match_encoding")
    empty = _validate_encoding(empty_input_encoding, "empty_input_encoding")
    data = byte_str if isinstance(byte_str, bytes) else bytes(byte_str)
    results = run_pipeline(
        data,
        encoding_era,
        max_bytes=max_bytes,
        include_encodings=include,
        exclude_encodings=exclude,
        no_match_encoding=no_match,
        empty_input_encoding=empty,
    )
    dicts = [r.to_dict() for r in results]
    # Drop low-confidence candidates unless the caller asked for everything;
    # if nothing clears the threshold, keep the unfiltered list as a fallback.
    if not ignore_threshold:
        filtered = [d for d in dicts if d["confidence"] > MINIMUM_THRESHOLD]
        if filtered:
            dicts = filtered
    for d in dicts:
        if prefer_superset:
            apply_preferred_superset(d)
        if compat_names:
            apply_compat_names(d)
    return sorted(dicts, key=lambda d: d["confidence"], reverse=True)