Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/output_names.py: 93%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Public-API encoding-name remapping.
3Two output transforms applied to detection results before they cross the
4public API:
6* :func:`apply_preferred_superset` -- when the ``prefer_superset`` API option
7 is enabled, replaces detected ISO/subset encoding names with their
8 Windows/CP supersets that modern software actually uses
9 (e.g., ISO-8859-1 -> Windows-1252).
11* :func:`apply_compat_names` -- when the default ``compat_names=True`` mode
12 is enabled, maps internal Python codec names to the names chardet 5.x/6.x
13 returned, preserving backward compatibility for callers that compare
14 encoding strings directly.
16Both transforms operate in-place on a :class:`~chardet.pipeline.DetectionDict`
17and return the same dict for fluent chaining.
18"""
20from __future__ import annotations
22from chardet.pipeline import DetectionDict
# Preferred superset for each encoding, consulted when the ``prefer_superset``
# API option is enabled. Browsers, editors and other modern software treat
# these ISO/subset encodings as their Windows/CP counterparts, so the detected
# name is swapped for that superset.
# Both keys and values are Python codec names (e.g. "cp1252"), not display
# names such as "Windows-1252".
PREFERRED_SUPERSET: dict[str, str] = {
    # Plain ASCII and Korean EUC
    "ascii": "cp1252",
    "euc_kr": "cp949",
    # ISO-8859 parts -> Windows code pages
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    # Thai
    "tis-620": "cp874",
}
# Python codec name -> display name as returned by chardet 5.x/6.x.
# An entry exists only where the two spellings differ. Codecs whose name
# already equals the compat output (e.g., "ascii", "utf-8") and encodings
# that are new in v7 are deliberately absent, so their codec name passes
# through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # --- chardet 5.x: encodings that already existed under another name ---
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # --- chardet 6.x: encodings introduced in 6.x under another name ---
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}
def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
    """Rename ``result["encoding"]`` through *mapping*, mutating *result*.

    :param result: Detection result whose ``"encoding"`` entry is rewritten.
    :param mapping: Lookup table from current name to replacement name.
    :returns: The same *result* object that was passed in.
    """
    current = result.get("encoding")
    if not isinstance(current, str):
        # A missing key or a non-string value (e.g. ``None``) is left untouched.
        return result
    # Names absent from the table map to themselves.
    result["encoding"] = mapping.get(current, current)
    return result
def apply_preferred_superset(result: DetectionDict) -> DetectionDict:
    """Swap the detected encoding for its preferred Windows/CP superset.

    The ``"encoding"`` entry of *result* is looked up in
    :data:`PREFERRED_SUPERSET` and overwritten in-place; names without a
    superset entry are left as they are.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* object, so calls can be chained fluently.
    """
    return _remap_encoding(result, PREFERRED_SUPERSET)


# Deprecated alias of ``apply_preferred_superset``, retained so that external
# consumers still using the old name keep working.
apply_legacy_rename = apply_preferred_superset
def apply_compat_names(result: DetectionDict) -> DetectionDict:
    """Translate an internal codec name into its chardet 5.x/6.x spelling.

    The ``"encoding"`` entry of *result* is looked up in ``_COMPAT_NAMES`` and
    overwritten in-place; names without a compat entry are left as they are.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* object, so calls can be chained fluently.
    """
    return _remap_encoding(result, _COMPAT_NAMES)