Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/equivalences.py: 49%
Shortcuts on this page:
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Encoding equivalences and name remapping.
3This module defines:
51. **Directional supersets** for accuracy evaluation: detecting a superset
6 encoding when the expected encoding is a subset is correct (e.g., detecting
7 UTF-8 when expected is ASCII), but not the reverse.
92. **Bidirectional equivalents**: groups of encodings where detecting any
10 member when another member was expected is considered correct. This
11 includes UTF-16/UTF-32 endian variants (which encode the same text with
12 different byte order) and ISO-2022-JP branch variants (which are
13 compatible extensions of the same base encoding).
153. **Preferred superset mapping** for the ``prefer_superset`` API option:
16 replaces detected ISO/subset encoding names with their Windows/CP superset
17 equivalents that modern software actually uses.
194. **Compatibility names** for the default ``compat_names=True`` mode: maps
20 internal Python codec names to the names chardet 5.x/6.x returned,
21 preserving backward compatibility for callers that compare encoding
22 strings directly.
23"""
25from __future__ import annotations
27import unicodedata
28from collections.abc import Callable
30from chardet.pipeline import DetectionDict
31from chardet.registry import lookup_encoding
# Directional superset relationships: detecting any of the supersets
# when the expected encoding is the subset counts as correct.
# E.g., expected=ascii, detected=utf-8 -> correct (utf-8 ⊃ ascii).
# But expected=utf-8, detected=ascii -> wrong (ascii ⊄ utf-8).
#
# Keys are display-cased encoding names; values are lowercase Python codec
# names of the acceptable supersets.
#
# Note: some subset keys (iso-8859-11) are not in the detection
# registry — the detector never returns them. They appear here because
# chardet test-suite expected values use these names, so the superset
# mapping is needed for accuracy evaluation only.
SUPERSETS: dict[str, frozenset[str]] = {
    "ASCII": frozenset({"utf-8", "cp1252"}),
    "TIS-620": frozenset({"iso8859-11", "cp874"}),
    "ISO-8859-11": frozenset({"cp874"}),
    "GB2312": frozenset({"gb18030"}),
    "GBK": frozenset({"gb18030"}),
    "Big5": frozenset({"big5hkscs", "cp950"}),
    "Shift_JIS": frozenset({"cp932", "shift_jis_2004"}),
    "Shift-JISX0213": frozenset({"shift_jis_2004"}),
    "EUC-JP": frozenset({"euc_jis_2004"}),
    "EUC-JISX0213": frozenset({"euc_jis_2004"}),
    "EUC-KR": frozenset({"cp949"}),
    "CP037": frozenset({"cp1140"}),
    # ISO-2022-JP subsets: any branch variant is acceptable.
    # In our registry, base ISO-2022-JP is an alias of iso2022_jp_2, so all
    # three extended variants are supersets of the same base. While the
    # extended variants use different escape sequences for non-basic characters,
    # real-world files rarely use those extensions — the base JIS X 0208
    # character set is shared by all variants and cross-decodes identically.
    # ISO2022-JP-1 and ISO2022-JP-3 use Python codec names (no hyphen between
    # "ISO" and "2022") because they appear as expected values in the test suite,
    # not as canonical chardet output. They are consumed through
    # _NORMALIZED_SUPERSETS which normalizes via codecs.lookup().
    "ISO-2022-JP": frozenset({"iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"}),
    "ISO2022-JP-1": frozenset({"iso2022_jp_2", "iso2022_jp_ext"}),
    "ISO2022-JP-3": frozenset({"iso2022_jp_2004"}),
    # ISO/Windows superset pairs
    "ISO-8859-1": frozenset({"cp1252"}),
    "ISO-8859-2": frozenset({"cp1250"}),
    "ISO-8859-5": frozenset({"cp1251"}),
    "ISO-8859-6": frozenset({"cp1256"}),
    "ISO-8859-7": frozenset({"cp1253"}),
    "ISO-8859-8": frozenset({"cp1255"}),
    "ISO-8859-9": frozenset({"cp1254"}),
    "ISO-8859-13": frozenset({"cp1257"}),
    # UTF-16/32: bare form (BOM-aware) is interchangeable with either endianness,
    # but LE and BE are NOT interchangeable with each other.
    "UTF-16": frozenset({"utf-16-le", "utf-16-be"}),
    "UTF-16-LE": frozenset({"utf-16"}),
    "UTF-16-BE": frozenset({"utf-16"}),
    "UTF-32": frozenset({"utf-32-le", "utf-32-be"}),
    "UTF-32-LE": frozenset({"utf-32"}),
    "UTF-32-BE": frozenset({"utf-32"}),
}
# Preferred superset name for each encoding, used by the ``prefer_superset``
# API option (historically exposed as ``should_rename_legacy``). When enabled,
# detected encoding names are replaced with the Windows/CP superset that
# modern software actually uses (browsers, editors, etc. treat these ISO
# subsets as their Windows counterparts).
# NOTE(review): both keys and values are lowercase Python codec names, not
# display-cased names like "Windows-1252" — display casing presumably comes
# from the compat-name pass (_COMPAT_NAMES); confirm the pipeline applies
# apply_compat_names after apply_preferred_superset.
PREFERRED_SUPERSET: dict[str, str] = {
    "ascii": "cp1252",
    "euc_kr": "cp949",
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    "tis-620": "cp874",
}
108def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
109 """Replace the encoding name using *mapping*, modifying *result* in-place."""
110 enc = result.get("encoding")
111 if isinstance(enc, str):
112 result["encoding"] = mapping.get(enc, enc)
113 return result
def apply_preferred_superset(
    result: DetectionDict,
) -> DetectionDict:
    """Swap the detected encoding name for its preferred Windows/CP superset.

    The ``"encoding"`` entry of *result* is updated in-place; *result* itself
    is returned so the call can be chained fluently.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, PREFERRED_SUPERSET)


# Deprecated alias — kept for external consumers.
apply_legacy_rename = apply_preferred_superset
# Mapping from Python codec names to chardet 5.x/6.x compatible display names.
# Only entries where the codec name differs from the compat output are listed.
# Encodings where codec name == compat name (e.g., "ascii", "utf-8") and
# encodings new to v7 have no entry — the codec name passes through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # 5.x compat — these encodings existed in chardet 5.x with different names
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # 6.x compat — new in chardet 6.x with different names
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}
def apply_compat_names(
    result: DetectionDict,
) -> DetectionDict:
    """Rename the internal codec name to its chardet 5.x/6.x display name.

    The ``"encoding"`` entry of *result* is updated in-place; *result* itself
    is returned so the call can be chained fluently.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, _COMPAT_NAMES)
# Bidirectional equivalents -- groups where any member is acceptable for any other.
#
# NOTE: UTF-16/32 endianness is handled via directional SUPERSETS instead,
# because wrong endianness garbles text. ISO-2022-JP variants remain here
# because base ISO-2022-JP is an alias of iso2022_jp_2 in our registry, so
# the SUPERSETS entries already make all variants interchangeable via the
# shared base.
BIDIRECTIONAL_GROUPS: tuple[tuple[str, ...], ...] = (
    ("iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"),
)
# Bidirectional language equivalences — groups of ISO 639-1 codes for
# languages that are nearly indistinguishable by statistical detection.
# Detecting any member when another member of the same group was expected
# is considered acceptable.
LANGUAGE_EQUIVALENCES: tuple[tuple[str, ...], ...] = (
    ("sk", "cs"),  # Slovak / Czech — ~85% mutual intelligibility
    (
        "uk",
        "ru",
        "bg",
        "be",
    ),  # East Slavic + Bulgarian — shared Cyrillic, high written overlap
    ("ms", "id"),  # Malay / Indonesian — standardized variants of one language
    (
        "no",
        "da",
        "sv",
    ),  # Scandinavian — mutual intelligibility across the dialect continuum
)
225def _build_group_index(
226 groups: tuple[tuple[str, ...], ...],
227 normalize: Callable[[str], str] = lambda x: x,
228) -> dict[str, frozenset[str]]:
229 """Build a lookup: key -> frozenset of all equivalent keys in the same group."""
230 result: dict[str, frozenset[str]] = {}
231 for group in groups:
232 normed = frozenset(normalize(n) for n in group)
233 for name in group:
234 result[normalize(name)] = normed
235 return result
_LANGUAGE_EQUIV: dict[str, frozenset[str]] = _build_group_index(LANGUAGE_EQUIVALENCES)


def is_language_equivalent(expected: str, detected: str) -> bool:
    """Check whether *detected* is an acceptable language for *expected*.

    Returns ``True`` when *expected* and *detected* are the same ISO 639-1
    code, or belong to the same equivalence group in
    :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    """
    if expected == detected:
        return True
    return detected in _LANGUAGE_EQUIV.get(expected, frozenset())
# Pre-built normalized lookups for fast comparison.
# Built iteratively because multiple SUPERSETS keys can normalize to the same
# canonical name (e.g., Shift_JIS and Shift-JISX0213 both → shift_jis_2004).
# Values are merged (unioned) when keys collide.
#
# Fallback names are lowercased so they agree with the ``.lower()`` fallback
# used by is_correct()/is_equivalent_detection() when lookup_encoding()
# returns a falsy value; without this, a display-cased key such as
# "Shift-JISX0213" could never match its lowered lookup form.
_NORMALIZED_SUPERSETS: dict[str, frozenset[str]] = {}
for _subset, _supersets in SUPERSETS.items():
    _key = lookup_encoding(_subset) or _subset.lower()
    _normed = frozenset(lookup_encoding(s) or s.lower() for s in _supersets)
    _NORMALIZED_SUPERSETS[_key] = _NORMALIZED_SUPERSETS.get(_key, frozenset()) | _normed
# Don't leak the loop temporaries as module attributes.
del _subset, _supersets, _key, _normed


_NORMALIZED_BIDIR: dict[str, frozenset[str]] = _build_group_index(
    BIDIRECTIONAL_GROUPS, normalize=lambda n: lookup_encoding(n) or n.lower()
)
def is_correct(expected: str | None, detected: str | None) -> bool:
    """Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional byte-order group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    # 1. Exact match after normalization.
    if norm_exp == norm_det:
        return True

    # 2. Bidirectional (same byte-order group).
    if norm_det in _NORMALIZED_BIDIR.get(norm_exp, frozenset()):
        return True

    # 3. Detected is a known superset of expected.
    return norm_det in _NORMALIZED_SUPERSETS.get(norm_exp, frozenset())
309def _strip_combining(text: str) -> str:
310 """NFKD-normalize *text* and strip all combining marks."""
311 nfkd = unicodedata.normalize("NFKD", text)
312 return "".join(c for c in nfkd if not unicodedata.combining(c))
# Pre-computed symbol pair lookups for O(1) equivalence checks.
# Both orderings are stored to avoid constructing temporaries per call.
# NOTE(review): ¤ ↔ € presumably reflects the ISO-8859-1 vs ISO-8859-15
# currency-sign code-point swap — confirm before extending this set.
_EQUIVALENT_SYMBOL_PAIRS: frozenset[tuple[str, str]] = frozenset(
    {
        ("¤", "€"),
        ("€", "¤"),
    }
)
def _chars_equivalent(a: str, b: str) -> bool:
    """Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means:
    - Same character, OR
    - An explicitly listed symbol equivalence (e.g. ¤ ↔ €), OR
    - Same base letter after stripping combining marks
    """
    if a == b or (a, b) in _EQUIVALENT_SYMBOL_PAIRS:
        return True
    # Fall back to comparing the base letters with diacritics removed.
    return _strip_combining(a) == _strip_combining(b)
def is_equivalent_detection(
    data: bytes, expected: str | None, detected: str | None
) -> bool:
    """Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()
    if norm_exp == norm_det:
        return True

    # Both decodes must succeed before any character-level comparison.
    try:
        decoded_exp = data.decode(norm_exp)
        decoded_det = data.decode(norm_det)
    except (UnicodeDecodeError, LookupError):
        return False

    if decoded_exp == decoded_det:
        return True
    # Per-character comparison only makes sense for equal-length outputs.
    if len(decoded_exp) != len(decoded_det):
        return False
    return all(
        _chars_equivalent(x, y)
        for x, y in zip(decoded_exp, decoded_det, strict=True)
    )