Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py: 56%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Universal character encoding detector — MIT-licensed rewrite."""
3from __future__ import annotations
5from chardet._utils import (
6 _DEFAULT_CHUNK_SIZE,
7 DEFAULT_MAX_BYTES,
8 MINIMUM_THRESHOLD,
9 _resolve_prefer_superset,
10 _validate_max_bytes,
11 _warn_deprecated_chunk_size,
12)
13from chardet._version import __version__
14from chardet.detector import UniversalDetector
15from chardet.enums import EncodingEra, LanguageFilter
16from chardet.equivalences import apply_compat_names, apply_preferred_superset
17from chardet.pipeline import DetectionDict, DetectionResult
18from chardet.pipeline.orchestrator import run_pipeline
# Public names of this package: the list below is what
# ``from chardet import *`` exports.
20__all__ = [
21 "DEFAULT_MAX_BYTES",
22 "MINIMUM_THRESHOLD",
23 "DetectionDict",
24 "DetectionResult",
25 "EncodingEra",
26 "LanguageFilter",
27 "UniversalDetector",
28 "__version__",
29 "detect",
30 "detect_all",
31]
def detect(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
) -> DetectionDict:
    """Guess the character encoding of a byte string.

    The detection pipeline is run once and its first result is returned
    as a dictionary, optionally post-processed by the superset and
    compatibility-name remappings.

    :param byte_str: The byte sequence whose encoding should be detected.
        A ``bytearray`` is copied into ``bytes`` before detection.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents (e.g., ISO-8859-1 -> Windows-1252).
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x.  If ``False``, return raw Python
        codec names.
    :returns: A dictionary with keys ``"encoding"``, ``"confidence"``, and
        ``"language"``.
    """
    # Argument checks run first, in the same order as before: deprecation
    # warning, then max_bytes validation, then the legacy-flag resolution.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    use_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline is always handed an actual ``bytes`` object.
    payload = bytes(byte_str) if not isinstance(byte_str, bytes) else byte_str
    candidates = run_pipeline(payload, encoding_era, max_bytes=max_bytes)
    best = candidates[0].to_dict()

    # Superset remapping is applied before the compatibility renaming.
    post_processing = (
        (use_superset, apply_preferred_superset),
        (compat_names, apply_compat_names),
    )
    for enabled, rewrite in post_processing:
        if enabled:
            rewrite(best)
    return best
def detect_all(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    ignore_threshold: bool = False,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
) -> list[DetectionDict]:
    """Return every candidate encoding for a byte string.

    Unless *ignore_threshold* is set, candidates whose confidence is
    <= MINIMUM_THRESHOLD (0.20) are dropped.  Should that leave nothing,
    the complete unfiltered list is used instead, so the caller always
    receives at least one result.

    :param byte_str: The byte sequence whose encoding should be detected.
        A ``bytearray`` is copied into ``bytes`` before detection.
    :param ignore_threshold: If ``True``, return all candidate encodings
        regardless of confidence score.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents.
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x.  If ``False``, return raw Python
        codec names.
    :returns: A list of dictionaries, sorted by descending confidence.
    """
    # Argument checks run first, in the same order as before: deprecation
    # warning, then max_bytes validation, then the legacy-flag resolution.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    use_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline is always handed an actual ``bytes`` object.
    payload = bytes(byte_str) if not isinstance(byte_str, bytes) else byte_str
    candidates = [
        hit.to_dict()
        for hit in run_pipeline(payload, encoding_era, max_bytes=max_bytes)
    ]

    if not ignore_threshold:
        confident = [
            entry for entry in candidates if entry["confidence"] > MINIMUM_THRESHOLD
        ]
        # An empty filter result falls back to the unfiltered candidates.
        candidates = confident or candidates

    # Per entry: superset remapping first, then compatibility renaming.
    for entry in candidates:
        if use_superset:
            apply_preferred_superset(entry)
        if compat_names:
            apply_compat_names(entry)

    # ``candidates`` is a list local to this call, so sorting it in place is
    # equivalent to returning a sorted copy.
    candidates.sort(key=lambda entry: entry["confidence"], reverse=True)
    return candidates