# chardet/equivalences.py

1"""Encoding equivalences and name remapping. 

2 

3This module defines: 

4 

51. **Directional supersets** for accuracy evaluation: detecting a superset 

6 encoding when the expected encoding is a subset is correct (e.g., detecting 

7 UTF-8 when expected is ASCII), but not the reverse. 

8 

92. **Bidirectional equivalents**: groups of encodings where detecting any 

10 member when another member was expected is considered correct. This 

11 includes UTF-16/UTF-32 endian variants (which encode the same text with 

12 different byte order) and ISO-2022-JP branch variants (which are 

13 compatible extensions of the same base encoding). 

14 

153. **Preferred superset mapping** for the ``prefer_superset`` API option: 

16 replaces detected ISO/subset encoding names with their Windows/CP superset 

17 equivalents that modern software actually uses. 

18 

194. **Compatibility names** for the default ``compat_names=True`` mode: maps 

20 internal Python codec names to the names chardet 5.x/6.x returned, 

21 preserving backward compatibility for callers that compare encoding 

22 strings directly. 

23""" 

from __future__ import annotations

import unicodedata
from collections.abc import Callable

from chardet.pipeline import DetectionDict
from chardet.registry import lookup_encoding

# Directional superset relationships: detecting any of the supersets
# when the expected encoding is the subset counts as correct.
# E.g., expected=ascii, detected=utf-8 -> correct (utf-8 ⊃ ascii).
# But expected=utf-8, detected=ascii -> wrong (ascii ⊅ utf-8).
#
# Note: some subset keys (iso-8859-11) are not in the detection
# registry — the detector never returns them. They appear here because
# chardet test-suite expected values use these names, so the superset
# mapping is needed for accuracy evaluation only.
SUPERSETS: dict[str, frozenset[str]] = {
    "ASCII": frozenset({"utf-8", "cp1252"}),
    "TIS-620": frozenset({"iso8859-11", "cp874"}),
    "ISO-8859-11": frozenset({"cp874"}),
    "GB2312": frozenset({"gb18030"}),
    "GBK": frozenset({"gb18030"}),
    "Big5": frozenset({"big5hkscs", "cp950"}),
    "Shift_JIS": frozenset({"cp932", "shift_jis_2004"}),
    "Shift-JISX0213": frozenset({"shift_jis_2004"}),
    "EUC-JP": frozenset({"euc_jis_2004"}),
    "EUC-JISX0213": frozenset({"euc_jis_2004"}),
    "EUC-KR": frozenset({"cp949"}),
    "CP037": frozenset({"cp1140"}),
    # ISO-2022-JP subsets: any branch variant is acceptable.
    # In our registry, base ISO-2022-JP is an alias of iso2022_jp_2, so all
    # three extended variants are supersets of the same base. While the
    # extended variants use different escape sequences for non-basic characters,
    # real-world files rarely use those extensions — the base JIS X 0208
    # character set is shared by all variants and cross-decodes identically.
    # ISO2022-JP-1 and ISO2022-JP-3 are spelled without a hyphen between
    # "ISO" and "2022" because that is how they appear as expected values in
    # the test suite; they are not canonical chardet output. They are consumed
    # through _NORMALIZED_SUPERSETS, which normalizes via lookup_encoding().
    "ISO-2022-JP": frozenset({"iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"}),
    "ISO2022-JP-1": frozenset({"iso2022_jp_2", "iso2022_jp_ext"}),
    "ISO2022-JP-3": frozenset({"iso2022_jp_2004"}),
    # ISO/Windows superset pairs
    "ISO-8859-1": frozenset({"cp1252"}),
    "ISO-8859-2": frozenset({"cp1250"}),
    "ISO-8859-5": frozenset({"cp1251"}),
    "ISO-8859-6": frozenset({"cp1256"}),
    "ISO-8859-7": frozenset({"cp1253"}),
    "ISO-8859-8": frozenset({"cp1255"}),
    "ISO-8859-9": frozenset({"cp1254"}),
    "ISO-8859-13": frozenset({"cp1257"}),
    # UTF-16/32: the bare form (BOM-aware) is interchangeable with either
    # endianness, but LE and BE are NOT interchangeable with each other.
    "UTF-16": frozenset({"utf-16-le", "utf-16-be"}),
    "UTF-16-LE": frozenset({"utf-16"}),
    "UTF-16-BE": frozenset({"utf-16"}),
    "UTF-32": frozenset({"utf-32-le", "utf-32-be"}),
    "UTF-32-LE": frozenset({"utf-32"}),
    "UTF-32-BE": frozenset({"utf-32"}),
}
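
# Illustration (doctest-style; is_correct() below consults a normalized
# form of this table):
#
#     >>> "utf-8" in SUPERSETS["ASCII"]   # expected=ASCII, detected=UTF-8: OK
#     True
#     >>> "UTF-8" in SUPERSETS            # no reverse entry: ASCII is not an
#     False                               # acceptable answer for UTF-8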

# Preferred superset name for each encoding, used by the ``prefer_superset``
# API option (historically ``should_rename_legacy``; see the deprecated
# ``apply_legacy_rename`` alias below). When enabled, detected encoding names
# are replaced with the Windows/CP superset that modern software actually uses
# (browsers, editors, etc. treat these ISO subsets as their Windows
# counterparts).
# Values are internal codec names (e.g. "cp1252"); where a compat display name
# exists, the ``compat_names`` pass converts them (e.g. to "Windows-1252").
PREFERRED_SUPERSET: dict[str, str] = {
    "ascii": "cp1252",
    "euc_kr": "cp949",
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    "tis-620": "cp874",
}


def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
    """Replace the encoding name using *mapping*, modifying *result* in-place."""
    enc = result.get("encoding")
    if isinstance(enc, str):
        result["encoding"] = mapping.get(enc, enc)
    return result


def apply_preferred_superset(
    result: DetectionDict,
) -> DetectionDict:
    """Replace the encoding name with its preferred Windows/CP superset.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, PREFERRED_SUPERSET)
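
# Usage sketch (hypothetical result values; only the "encoding" key matters):
#
#     >>> result = {"encoding": "iso8859-1", "confidence": 0.73, "language": "en"}
#     >>> apply_preferred_superset(result)["encoding"]
#     'cp1252'
#     >>> result["encoding"]  # the input dict was modified in-place
#     'cp1252'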


# Deprecated alias — kept for external consumers.
apply_legacy_rename = apply_preferred_superset


# Mapping from Python codec names to chardet 5.x/6.x compatible display names.
# Only entries where codec name differs from the compat output are listed.
# Encodings where codec name == compat name (e.g., "ascii", "utf-8") and
# encodings new to v7 have no entry — the codec name passes through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # 5.x compat — these encodings existed in chardet 5.x with different names
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # 6.x compat — new in chardet 6.x with different names
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}


def apply_compat_names(
    result: DetectionDict,
) -> DetectionDict:
    """Convert internal codec names to chardet 5.x/6.x compatible names.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, _COMPAT_NAMES)
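
# Usage sketch (hypothetical result values):
#
#     >>> result = {"encoding": "cp1252", "confidence": 0.99, "language": "fr"}
#     >>> apply_compat_names(result)["encoding"]
#     'Windows-1252'
#     >>> apply_compat_names({"encoding": "utf-8"})["encoding"]  # no entry: unchanged
#     'utf-8'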


# Bidirectional equivalents -- groups where any member is acceptable for any other.
#
# NOTE: UTF-16/32 endianness is handled via directional SUPERSETS instead,
# because wrong endianness garbles text. The ISO-2022-JP variants stay here
# because the SUPERSETS entries only cover the directional case where the
# base encoding (an alias of iso2022_jp_2 in our registry) is expected;
# this group additionally accepts any extended variant when another
# extended variant was expected.
BIDIRECTIONAL_GROUPS: tuple[tuple[str, ...], ...] = (
    ("iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"),
)
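
# Example (a sketch; assumes lookup_encoding() maps these codec names to
# themselves, so both sides land in the same group):
#
#     >>> is_correct("iso2022_jp_2004", "iso2022_jp_ext")
#     True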

# Bidirectional language equivalences — groups of ISO 639-1 codes for
# languages that are nearly indistinguishable by statistical detection.
# Detecting any member when another member of the same group was expected
# is considered acceptable.
LANGUAGE_EQUIVALENCES: tuple[tuple[str, ...], ...] = (
    ("sk", "cs"),  # Slovak / Czech — ~85% mutual intelligibility
    (
        "uk",
        "ru",
        "bg",
        "be",
    ),  # East Slavic + Bulgarian — shared Cyrillic, high written overlap
    ("ms", "id"),  # Malay / Indonesian — standardized variants of one language
    (
        "no",
        "da",
        "sv",
    ),  # Scandinavian — mutual intelligibility across the dialect continuum
)


def _build_group_index(
    groups: tuple[tuple[str, ...], ...],
    normalize: Callable[[str], str] = lambda x: x,
) -> dict[str, frozenset[str]]:
    """Build a lookup: key -> frozenset of all equivalent keys in the same group."""
    result: dict[str, frozenset[str]] = {}
    for group in groups:
        normed = frozenset(normalize(n) for n in group)
        for name in group:
            result[normalize(name)] = normed
    return result
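
# Toy example (illustrative names, not real encodings):
#
#     >>> idx = _build_group_index((("A", "B"),), normalize=str.lower)
#     >>> idx["a"] == frozenset({"a", "b"})
#     True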


_LANGUAGE_EQUIV: dict[str, frozenset[str]] = _build_group_index(LANGUAGE_EQUIVALENCES)


def is_language_equivalent(expected: str, detected: str) -> bool:
    """Check whether *detected* is an acceptable language for *expected*.

    Returns ``True`` when *expected* and *detected* are the same ISO 639-1
    code, or belong to the same equivalence group in
    :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    """
    if expected == detected:
        return True
    group = _LANGUAGE_EQUIV.get(expected)
    return group is not None and detected in group
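
# Examples (drawn from the groups above; membership is symmetric):
#
#     >>> is_language_equivalent("sk", "cs")
#     True
#     >>> is_language_equivalent("cs", "de")
#     False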


# Pre-built normalized lookups for fast comparison.
# Built iteratively because multiple SUPERSETS keys can normalize to the same
# canonical name (e.g., Shift_JIS and Shift-JISX0213 both → shift_jis_2004).
# Values are merged (unioned) when keys collide.
_NORMALIZED_SUPERSETS: dict[str, frozenset[str]] = {}
for _subset, _supersets in SUPERSETS.items():
    _key = lookup_encoding(_subset) or _subset
    _normed = frozenset(lookup_encoding(s) or s for s in _supersets)
    _NORMALIZED_SUPERSETS[_key] = _NORMALIZED_SUPERSETS.get(_key, frozenset()) | _normed
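
# Collision-merge sketch (pure-dict illustration with made-up keys, showing
# how two colliding entries union rather than overwrite):
#
#     >>> merged: dict[str, frozenset[str]] = {}
#     >>> for key, vals in [("sjis", {"cp932"}), ("sjis", {"shift_jis_2004"})]:
#     ...     merged[key] = merged.get(key, frozenset()) | frozenset(vals)
#     >>> sorted(merged["sjis"])
#     ['cp932', 'shift_jis_2004']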


_NORMALIZED_BIDIR: dict[str, frozenset[str]] = _build_group_index(
    BIDIRECTIONAL_GROUPS, normalize=lambda n: lookup_encoding(n) or n
)


def is_correct(expected: str | None, detected: str | None) -> bool:
    """Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional equivalence group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False
    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    # 1. Exact match
    if norm_exp == norm_det:
        return True

    # 2. Bidirectional (same equivalence group)
    if norm_exp in _NORMALIZED_BIDIR and norm_det in _NORMALIZED_BIDIR[norm_exp]:
        return True

    # 3. Superset is acceptable (detected is a known superset of expected)
    return (
        norm_exp in _NORMALIZED_SUPERSETS
        and norm_det in _NORMALIZED_SUPERSETS[norm_exp]
    )
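
# Examples (a sketch; assumes lookup_encoding() canonicalizes the stdlib
# UTF-16 names as usual):
#
#     >>> is_correct(None, None)                # binary file, nothing detected
#     True
#     >>> is_correct("UTF-16", "utf-16-le")     # bare form accepts either endianness
#     True
#     >>> is_correct("utf-16-le", "utf-16-be")  # wrong endianness is rejected
#     False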


def _strip_combining(text: str) -> str:
    """NFKD-normalize *text* and strip all combining marks."""
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join(c for c in nfkd if not unicodedata.combining(c))
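
# Example: U+00E9 decomposes under NFKD to "e" plus a combining acute accent,
# and the combining mark is then dropped:
#
#     >>> _strip_combining("café")
#     'cafe'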


# Pre-computed symbol pair lookups for O(1) equivalence checks.
# Both orderings are stored to avoid constructing temporaries per call.
_EQUIVALENT_SYMBOL_PAIRS: frozenset[tuple[str, str]] = frozenset(
    {
        ("¤", "€"),
        ("€", "¤"),
    }
)


def _chars_equivalent(a: str, b: str) -> bool:
    """Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means:
    - Same character, OR
    - Same base letter after stripping combining marks, OR
    - An explicitly listed symbol equivalence (e.g. ¤ ↔ €)
    """
    if a == b:
        return True
    if (a, b) in _EQUIVALENT_SYMBOL_PAIRS:
        return True
    # Compare base letters after stripping combining marks.
    return _strip_combining(a) == _strip_combining(b)
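
# Examples:
#
#     >>> _chars_equivalent("é", "e")   # same base letter
#     True
#     >>> _chars_equivalent("¤", "€")   # listed symbol equivalence
#     True
#     >>> _chars_equivalent("a", "b")
#     False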


def is_equivalent_detection(
    data: bytes, expected: str | None, detected: str | None
) -> bool:
    """Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    if norm_exp == norm_det:
        return True

    try:
        text_exp = data.decode(norm_exp)
        text_det = data.decode(norm_det)
    except (UnicodeDecodeError, LookupError):
        return False

    if text_exp == text_det:
        return True

    if len(text_exp) != len(text_det):
        return False

    return all(_chars_equivalent(a, b) for a, b in zip(text_exp, text_det, strict=True))
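
# Example: byte 0xA4 decodes to the currency sign "¤" in ISO-8859-1 but to the
# euro sign "€" in ISO-8859-15, a listed symbol equivalence, so the two
# decodings count as functionally identical (a sketch; assumes both names
# resolve through the registry or the stdlib fallback):
#
#     >>> is_equivalent_detection(b"100\xa4", "iso8859-1", "iso8859-15")
#     True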