Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/equivalences.py: 50%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Encoding equivalences and name remapping.
3This module defines:
51. **Directional supersets** for accuracy evaluation: detecting a superset
6 encoding when the expected encoding is a subset is correct (e.g., detecting
7 UTF-8 when expected is ASCII), but not the reverse.
92. **Bidirectional equivalents**: groups of encodings where detecting any
10 member when another member was expected is considered correct. This
11 includes UTF-16/UTF-32 endian variants (which encode the same text with
12 different byte order) and ISO-2022-JP branch variants (which are
13 compatible extensions of the same base encoding).
153. **Preferred superset mapping** for the ``prefer_superset`` API option:
16 replaces detected ISO/subset encoding names with their Windows/CP superset
17 equivalents that modern software actually uses.
194. **Compatibility names** for the default ``compat_names=True`` mode: maps
20 internal Python codec names to the names chardet 5.x/6.x returned,
21 preserving backward compatibility for callers that compare encoding
22 strings directly.
23"""
25from __future__ import annotations
27import unicodedata
28from collections.abc import Callable
30from chardet.pipeline import DetectionDict
31from chardet.registry import lookup_encoding
# Directional superset relationships: detecting any of the supersets
# when the expected encoding is the subset counts as correct.
# E.g., expected=ascii, detected=utf-8 -> correct (utf-8 ⊃ ascii).
# But expected=utf-8, detected=ascii -> wrong (ascii ⊄ utf-8).
#
# Keys are display-cased names; values are Python codec names. Both sides
# are normalized via lookup_encoding() when the table is consumed (see
# _NORMALIZED_SUPERSETS), so the casing here is purely cosmetic.
#
# Note: some subset keys (iso-8859-11) are not in the detection
# registry — the detector never returns them. They appear here because
# chardet test-suite expected values use these names, so the superset
# mapping is needed for accuracy evaluation only.
SUPERSETS: dict[str, frozenset[str]] = {
    "ASCII": frozenset({"utf-8", "cp1252"}),
    "TIS-620": frozenset({"iso8859-11", "cp874"}),
    "ISO-8859-11": frozenset({"cp874"}),
    "GB2312": frozenset({"gb18030"}),
    "GBK": frozenset({"gb18030"}),
    "Big5": frozenset({"big5hkscs", "cp950"}),
    "Shift_JIS": frozenset({"cp932", "shift_jis_2004"}),
    "Shift-JISX0213": frozenset({"shift_jis_2004"}),
    "EUC-JP": frozenset({"euc_jis_2004"}),
    "EUC-JISX0213": frozenset({"euc_jis_2004"}),
    "EUC-KR": frozenset({"cp949"}),
    # EBCDIC: cp1140 is cp037 with the euro sign (per the Python codecs docs).
    "CP037": frozenset({"cp1140"}),
    # ISO-2022-JP subsets: any branch variant is acceptable.
    # ISO2022-JP-1 and ISO2022-JP-3 use Python codec names (no hyphen between
    # "ISO" and "2022") because they appear as expected values in the test suite,
    # not as canonical chardet output. They are consumed through
    # _NORMALIZED_SUPERSETS which normalizes via codecs.lookup().
    "ISO-2022-JP": frozenset({"iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"}),
    "ISO2022-JP-1": frozenset({"iso2022_jp_2", "iso2022_jp_ext"}),
    "ISO2022-JP-3": frozenset({"iso2022_jp_2004"}),
    # ISO/Windows superset pairs (treated as acceptable answers for evaluation).
    "ISO-8859-1": frozenset({"cp1252"}),
    "ISO-8859-2": frozenset({"cp1250"}),
    "ISO-8859-5": frozenset({"cp1251"}),
    "ISO-8859-6": frozenset({"cp1256"}),
    "ISO-8859-7": frozenset({"cp1253"}),
    "ISO-8859-8": frozenset({"cp1255"}),
    "ISO-8859-9": frozenset({"cp1254"}),
    "ISO-8859-13": frozenset({"cp1257"}),
}
# Preferred superset name for each encoding, used by the ``prefer_superset``
# API option (formerly ``should_rename_legacy``; see the deprecated
# ``apply_legacy_rename`` alias). When enabled, detected encoding names are
# replaced with the Windows/CP superset that modern software actually uses
# (browsers, editors, etc. treat these ISO subsets as their Windows
# counterparts).
# NOTE(review): keys AND values here are lowercase Python codec names, not
# display-cased names — display casing (e.g. "Windows-1252") is applied
# afterwards by apply_compat_names() via _COMPAT_NAMES.
PREFERRED_SUPERSET: dict[str, str] = {
    "ascii": "cp1252",
    "euc_kr": "cp949",
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    "tis-620": "cp874",
}
95def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
96 """Replace the encoding name using *mapping*, modifying *result* in-place."""
97 enc = result.get("encoding")
98 if isinstance(enc, str):
99 result["encoding"] = mapping.get(enc, enc)
100 return result
def apply_preferred_superset(
    result: DetectionDict,
) -> DetectionDict:
    """Replace the encoding name with its preferred Windows/CP superset.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining. Names without a superset entry, and non-string
    encodings (e.g. ``None``), pass through unchanged.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    detected = result.get("encoding")
    if isinstance(detected, str):
        result["encoding"] = PREFERRED_SUPERSET.get(detected, detected)
    return result


# Deprecated alias — kept for external consumers.
apply_legacy_rename = apply_preferred_superset
# Mapping from Python codec names to chardet 5.x/6.x compatible display names.
# Keys are the codec names produced internally; values are the exact strings
# chardet 5.x/6.x returned, so callers that compare encoding strings keep working.
# Only entries where codec name differs from the compat output are listed.
# Encodings where codec name == compat name (e.g., "ascii", "utf-8") and
# encodings new to v7 have no entry — the codec name passes through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # 5.x compat — these encodings existed in chardet 5.x with different names
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # 6.x compat — new in chardet 6.x with different names
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}


# Backward compat alias
_LEGACY_NAMES = _COMPAT_NAMES
def apply_compat_names(
    result: DetectionDict,
) -> DetectionDict:
    """Convert internal codec names to chardet 5.x/6.x compatible names.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining. Codec names without a compat entry, and non-string
    encodings (e.g. ``None``), pass through unchanged.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    name = result.get("encoding")
    if isinstance(name, str):
        result["encoding"] = _COMPAT_NAMES.get(name, name)
    return result
# Bidirectional equivalents -- groups where any member is acceptable for any other.
# UTF-16/UTF-32 endian variants encode the same text with different byte order;
# the ISO-2022-JP branch variants are compatible extensions of one base encoding.
# Consumed through _NORMALIZED_BIDIR, which normalizes names via
# lookup_encoding() and indexes every member to its full group.
BIDIRECTIONAL_GROUPS: tuple[tuple[str, ...], ...] = (
    ("utf-16", "utf-16-le", "utf-16-be"),
    ("utf-32", "utf-32-le", "utf-32-be"),
    ("iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"),
)
# Bidirectional language equivalences — groups of ISO 639-1 codes for
# languages that are nearly indistinguishable by statistical detection.
# Detecting any member when another member of the same group was expected
# is considered acceptable.
# Consumed through _LANGUAGE_EQUIV / is_language_equivalent().
LANGUAGE_EQUIVALENCES: tuple[tuple[str, ...], ...] = (
    ("sk", "cs"),  # Slovak / Czech — ~85% mutual intelligibility
    (
        "uk",
        "ru",
        "bg",
        "be",
    ),  # East Slavic + Bulgarian — shared Cyrillic, high written overlap
    ("ms", "id"),  # Malay / Indonesian — standardized variants of one language
    (
        "no",
        "da",
        "sv",
    ),  # Scandinavian — mutual intelligibility across the dialect continuum
)
210def _build_group_index(
211 groups: tuple[tuple[str, ...], ...],
212 normalize: Callable[[str], str] = lambda x: x,
213) -> dict[str, frozenset[str]]:
214 """Build a lookup: key -> frozenset of all equivalent keys in the same group."""
215 result: dict[str, frozenset[str]] = {}
216 for group in groups:
217 normed = frozenset(normalize(n) for n in group)
218 for name in group:
219 result[normalize(name)] = normed
220 return result
# Identity-normalized index: ISO 639-1 code -> all codes in its equivalence group.
_LANGUAGE_EQUIV: dict[str, frozenset[str]] = _build_group_index(LANGUAGE_EQUIVALENCES)
def is_language_equivalent(expected: str, detected: str) -> bool:
    """Check whether *detected* is an acceptable language for *expected*.

    Returns ``True`` when *expected* and *detected* are the same ISO 639-1
    code, or belong to the same equivalence group in
    :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    """
    # Membership in an empty tuple is False, so codes outside any group
    # only match themselves.
    return expected == detected or detected in _LANGUAGE_EQUIV.get(expected, ())
# Pre-built normalized lookups for fast comparison.
# Built iteratively because multiple SUPERSETS keys can normalize to the same
# canonical name (e.g., Shift_JIS and Shift-JISX0213 both → shift_jis_2004).
# Values are merged (unioned) when keys collide.
# ``lookup_encoding(...) or name`` keeps unknown names as-is, so subset keys
# that are not in the detection registry (see SUPERSETS note) still match.
_NORMALIZED_SUPERSETS: dict[str, frozenset[str]] = {}
for _subset, _supersets in SUPERSETS.items():
    _key = lookup_encoding(_subset) or _subset
    _normed = frozenset(lookup_encoding(s) or s for s in _supersets)
    _NORMALIZED_SUPERSETS[_key] = _NORMALIZED_SUPERSETS.get(_key, frozenset()) | _normed


# Same normalization for the bidirectional groups: every member maps to the
# frozenset of all normalized members of its group.
_NORMALIZED_BIDIR: dict[str, frozenset[str]] = _build_group_index(
    BIDIRECTIONAL_GROUPS, normalize=lambda n: lookup_encoding(n) or n
)
def is_correct(expected: str | None, detected: str | None) -> bool:
    """Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional byte-order group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    """
    # None only matches None (binary file expected -> binary detected).
    if expected is None or detected is None:
        return expected is None and detected is None

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    # 1. Exact match
    if norm_exp == norm_det:
        return True

    # 2. Bidirectional (same byte-order group)
    if norm_det in _NORMALIZED_BIDIR.get(norm_exp, frozenset()):
        return True

    # 3. Superset is acceptable (detected is a known superset of expected)
    return norm_det in _NORMALIZED_SUPERSETS.get(norm_exp, frozenset())
294def _strip_combining(text: str) -> str:
295 """NFKD-normalize *text* and strip all combining marks."""
296 nfkd = unicodedata.normalize("NFKD", text)
297 return "".join(c for c in nfkd if not unicodedata.combining(c))
# Pre-computed symbol pair lookups for O(1) equivalence checks.
# Both orderings are stored to avoid constructing temporaries per call:
# membership is tested with the (a, b) tuple exactly as passed.
_EQUIVALENT_SYMBOL_PAIRS: frozenset[tuple[str, str]] = frozenset(
    {
        ("¤", "€"),  # generic currency sign ↔ euro sign
        ("€", "¤"),
    }
)
def _chars_equivalent(a: str, b: str) -> bool:
    """Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means:
    - Same character, OR
    - An explicitly listed symbol equivalence (e.g. ¤ ↔ €), OR
    - Same base letter after stripping combining marks
    """
    if a == b or (a, b) in _EQUIVALENT_SYMBOL_PAIRS:
        return True
    # Fall back to comparing base letters with combining marks removed.
    return _strip_combining(a) == _strip_combining(b)
def is_equivalent_detection(
    data: bytes, expected: str | None, detected: str | None
) -> bool:
    """Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    """
    # None only matches None (binary expected -> binary detected).
    if expected is None or detected is None:
        return expected is None and detected is None

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()
    if norm_exp == norm_det:
        return True

    # An unknown codec name or an undecodable payload is never equivalent.
    try:
        decoded_exp = data.decode(norm_exp)
        decoded_det = data.decode(norm_det)
    except (UnicodeDecodeError, LookupError):
        return False

    if decoded_exp == decoded_det:
        return True
    if len(decoded_exp) != len(decoded_det):
        return False

    # Same length: compare character by character for functional equivalence.
    for pair in zip(decoded_exp, decoded_det, strict=True):
        if not _chars_equivalent(*pair):
            return False
    return True