Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/equivalences.py: 50%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Encoding equivalences and name remapping.
3This module defines:
51. **Directional supersets** for accuracy evaluation: detecting a superset
6 encoding when the expected encoding is a subset is correct (e.g., detecting
7 UTF-8 when expected is ASCII), but not the reverse.
92. **Bidirectional equivalents**: groups of encodings where detecting any
10 member when another member was expected is considered correct. This
11 includes UTF-16/UTF-32 endian variants (which encode the same text with
12 different byte order) and ISO-2022-JP branch variants (which are
13 compatible extensions of the same base encoding).
153. **Preferred superset mapping** for the ``prefer_superset`` API option:
16 replaces detected ISO/subset encoding names with their Windows/CP superset
17 equivalents that modern software actually uses.
194. **Compatibility names** for the default ``compat_names=True`` mode: maps
20 internal Python codec names to the names chardet 5.x/6.x returned,
21 preserving backward compatibility for callers that compare encoding
22 strings directly.
23"""
25from __future__ import annotations
27import unicodedata
28from collections.abc import Callable
30from chardet.pipeline import DetectionDict
31from chardet.registry import lookup_encoding
# Directional superset relationships: detecting any of the supersets
# when the expected encoding is the subset counts as correct.
# E.g., expected=ascii, detected=utf-8 -> correct (utf-8 ⊃ ascii).
# But expected=utf-8, detected=ascii -> wrong (ascii ⊄ utf-8).
#
# Keys are display-cased names; values are Python codec names. Both sides
# are normalized via lookup_encoding() when the table is consumed (see
# _NORMALIZED_SUPERSETS), so the casing here is purely cosmetic.
#
# Note: some subset keys (iso-8859-11) are not in the detection
# registry — the detector never returns them. They appear here because
# chardet test-suite expected values use these names, so the superset
# mapping is needed for accuracy evaluation only.
SUPERSETS: dict[str, frozenset[str]] = {
    "ASCII": frozenset({"utf-8", "cp1252"}),
    "TIS-620": frozenset({"iso8859-11", "cp874"}),
    "ISO-8859-11": frozenset({"cp874"}),
    "GB2312": frozenset({"gb18030"}),
    "GBK": frozenset({"gb18030"}),
    "Big5": frozenset({"big5hkscs", "cp950"}),
    "Shift_JIS": frozenset({"cp932", "shift_jis_2004"}),
    "Shift-JISX0213": frozenset({"shift_jis_2004"}),
    "EUC-JP": frozenset({"euc_jis_2004"}),
    "EUC-JISX0213": frozenset({"euc_jis_2004"}),
    "EUC-KR": frozenset({"cp949"}),
    # EBCDIC: cp1140 is cp037 with the euro sign (per the Python codecs docs).
    "CP037": frozenset({"cp1140"}),
    # ISO-2022-JP subsets: any branch variant is acceptable.
    # ISO2022-JP-1 and ISO2022-JP-3 use Python codec names (no hyphen between
    # "ISO" and "2022") because they appear as expected values in the test suite,
    # not as canonical chardet output. They are consumed through
    # _NORMALIZED_SUPERSETS which normalizes via codecs.lookup().
    "ISO-2022-JP": frozenset({"iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"}),
    "ISO2022-JP-1": frozenset({"iso2022_jp_2", "iso2022_jp_ext"}),
    "ISO2022-JP-3": frozenset({"iso2022_jp_2004"}),
    # ISO/Windows superset pairs (treated as acceptable answers for evaluation).
    "ISO-8859-1": frozenset({"cp1252"}),
    "ISO-8859-2": frozenset({"cp1250"}),
    "ISO-8859-5": frozenset({"cp1251"}),
    "ISO-8859-6": frozenset({"cp1256"}),
    "ISO-8859-7": frozenset({"cp1253"}),
    "ISO-8859-8": frozenset({"cp1255"}),
    "ISO-8859-9": frozenset({"cp1254"}),
    "ISO-8859-13": frozenset({"cp1257"}),
}
# Preferred superset name for each encoding, used by the ``prefer_superset``
# API option (formerly ``should_rename_legacy``; see the deprecated
# ``apply_legacy_rename`` alias). When enabled, detected encoding names are
# replaced with the Windows/CP superset that modern software actually uses
# (browsers, editors, etc. treat these ISO subsets as their Windows
# counterparts).
# NOTE(review): keys AND values here are lowercase Python codec names, not
# display-cased names — display casing (e.g. "Windows-1252") is applied
# afterwards by apply_compat_names() via _COMPAT_NAMES.
PREFERRED_SUPERSET: dict[str, str] = {
    "ascii": "cp1252",
    "euc_kr": "cp949",
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    "tis-620": "cp874",
}
95def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
96 """Replace the encoding name using *mapping*, modifying *result* in-place."""
97 enc = result.get("encoding")
98 if isinstance(enc, str):
99 result["encoding"] = mapping.get(enc, enc)
100 return result
def apply_preferred_superset(
    result: DetectionDict,
) -> DetectionDict:
    """Replace the encoding name with its preferred Windows/CP superset.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining. Names without a superset entry, and non-string
    encodings (e.g. ``None``), pass through unchanged.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    detected = result.get("encoding")
    if isinstance(detected, str):
        result["encoding"] = PREFERRED_SUPERSET.get(detected, detected)
    return result


# Deprecated alias — kept for external consumers.
apply_legacy_rename = apply_preferred_superset
# Mapping from Python codec names to chardet 5.x/6.x compatible display names.
# Keys are the codec names produced internally; values are the exact strings
# chardet 5.x/6.x returned, so callers that compare encoding strings keep working.
# Only entries where codec name differs from the compat output are listed.
# Encodings where codec name == compat name (e.g., "ascii", "utf-8") and
# encodings new to v7 have no entry — the codec name passes through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # 5.x compat — these encodings existed in chardet 5.x with different names
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # 6.x compat — new in chardet 6.x with different names
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}


# Backward compat alias
_LEGACY_NAMES = _COMPAT_NAMES
def apply_compat_names(
    result: DetectionDict,
) -> DetectionDict:
    """Convert internal codec names to chardet 5.x/6.x compatible names.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining. Codec names without a compat entry, and non-string
    encodings (e.g. ``None``), pass through unchanged.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    name = result.get("encoding")
    if isinstance(name, str):
        result["encoding"] = _COMPAT_NAMES.get(name, name)
    return result
# Bidirectional equivalents -- groups where any member is acceptable for any other.
# UTF-16/UTF-32 endian variants encode the same text with different byte order;
# the ISO-2022-JP branch variants are compatible extensions of one base encoding.
# Consumed through _NORMALIZED_BIDIR, which normalizes names via
# lookup_encoding() and indexes every member to its full group.
BIDIRECTIONAL_GROUPS: tuple[tuple[str, ...], ...] = (
    ("utf-16", "utf-16-le", "utf-16-be"),
    ("utf-32", "utf-32-le", "utf-32-be"),
    ("iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"),
)
# Bidirectional language equivalences — groups of ISO 639-1 codes for
# languages that are nearly indistinguishable by statistical detection.
# Detecting any member when another member of the same group was expected
# is considered acceptable.
# Consumed through _LANGUAGE_EQUIV / is_language_equivalent().
LANGUAGE_EQUIVALENCES: tuple[tuple[str, ...], ...] = (
    ("sk", "cs"),  # Slovak / Czech — ~85% mutual intelligibility
    (
        "uk",
        "ru",
        "bg",
        "be",
    ),  # East Slavic + Bulgarian — shared Cyrillic, high written overlap
    ("ms", "id"),  # Malay / Indonesian — standardized variants of one language
    (
        "no",
        "da",
        "sv",
    ),  # Scandinavian — mutual intelligibility across the dialect continuum
)
210def _build_group_index(
211 groups: tuple[tuple[str, ...], ...],
212 normalize: Callable[[str], str] = lambda x: x,
213) -> dict[str, frozenset[str]]:
214 """Build a lookup: key -> frozenset of all equivalent keys in the same group."""
215 result: dict[str, frozenset[str]] = {}
216 for group in groups:
217 normed = frozenset(normalize(n) for n in group)
218 for name in group:
219 result[normalize(name)] = normed
220 return result
# Identity-normalized index: ISO 639-1 code -> all codes in its equivalence group.
_LANGUAGE_EQUIV: dict[str, frozenset[str]] = _build_group_index(LANGUAGE_EQUIVALENCES)
def is_language_equivalent(expected: str, detected: str) -> bool:
    """Check whether *detected* is an acceptable language for *expected*.

    Returns ``True`` when *expected* and *detected* are the same ISO 639-1
    code, or belong to the same equivalence group in
    :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    """
    # Membership in an empty tuple is False, so codes outside any group
    # only match themselves.
    return expected == detected or detected in _LANGUAGE_EQUIV.get(expected, ())
# Pre-built normalized lookups for fast comparison.
# Built iteratively because multiple SUPERSETS keys can normalize to the same
# canonical name (e.g., Shift_JIS and Shift-JISX0213 both → shift_jis_2004).
# Values are merged (unioned) when keys collide.
# ``lookup_encoding(...) or name`` keeps unknown names as-is, so subset keys
# that are not in the detection registry (see SUPERSETS note) still match.
_NORMALIZED_SUPERSETS: dict[str, frozenset[str]] = {}
for _subset, _supersets in SUPERSETS.items():
    _key = lookup_encoding(_subset) or _subset
    _normed = frozenset(lookup_encoding(s) or s for s in _supersets)
    _NORMALIZED_SUPERSETS[_key] = _NORMALIZED_SUPERSETS.get(_key, frozenset()) | _normed


# Same normalization for the bidirectional groups: every member maps to the
# frozenset of all normalized members of its group.
_NORMALIZED_BIDIR: dict[str, frozenset[str]] = _build_group_index(
    BIDIRECTIONAL_GROUPS, normalize=lambda n: lookup_encoding(n) or n
)
def is_correct(expected: str | None, detected: str | None) -> bool:
    """Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional byte-order group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    """
    # None only matches None (binary file expected -> binary detected).
    if expected is None or detected is None:
        return expected is None and detected is None

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    # 1. Exact match
    if norm_exp == norm_det:
        return True

    # 2. Bidirectional (same byte-order group)
    if norm_det in _NORMALIZED_BIDIR.get(norm_exp, frozenset()):
        return True

    # 3. Superset is acceptable (detected is a known superset of expected)
    return norm_det in _NORMALIZED_SUPERSETS.get(norm_exp, frozenset())
294def _strip_combining(text: str) -> str:
295 """NFKD-normalize *text* and strip all combining marks."""
296 nfkd = unicodedata.normalize("NFKD", text)
297 return "".join(c for c in nfkd if not unicodedata.combining(c))
# Pre-computed symbol pair lookups for O(1) equivalence checks.
# Both orderings are stored to avoid constructing temporaries per call:
# membership is tested with the (a, b) tuple exactly as passed.
_EQUIVALENT_SYMBOL_PAIRS: frozenset[tuple[str, str]] = frozenset(
    {
        ("¤", "€"),  # generic currency sign ↔ euro sign
        ("€", "¤"),
    }
)
def _chars_equivalent(a: str, b: str) -> bool:
    """Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means:
    - Same character, OR
    - An explicitly listed symbol equivalence (e.g. ¤ ↔ €), OR
    - Same base letter after stripping combining marks
    """
    if a == b or (a, b) in _EQUIVALENT_SYMBOL_PAIRS:
        return True
    # Fall back to comparing base letters with combining marks removed.
    return _strip_combining(a) == _strip_combining(b)
def is_equivalent_detection(
    data: bytes, expected: str | None, detected: str | None
) -> bool:
    """Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    """
    # None only matches None (binary expected -> binary detected).
    if expected is None or detected is None:
        return expected is None and detected is None

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()
    if norm_exp == norm_det:
        return True

    # An unknown codec name or an undecodable payload is never equivalent.
    try:
        decoded_exp = data.decode(norm_exp)
        decoded_det = data.decode(norm_det)
    except (UnicodeDecodeError, LookupError):
        return False

    if decoded_exp == decoded_det:
        return True
    if len(decoded_exp) != len(decoded_det):
        return False

    # Same length: compare character by character for functional equivalence.
    for pair in zip(decoded_exp, decoded_det, strict=True):
        if not _chars_equivalent(*pair):
            return False
    return True