Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/equivalences.py: 50%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

82 statements  

1"""Encoding equivalences and name remapping. 

2 

3This module defines: 

4 

51. **Directional supersets** for accuracy evaluation: detecting a superset 

6 encoding when the expected encoding is a subset is correct (e.g., detecting 

7 UTF-8 when expected is ASCII), but not the reverse. 

8 

92. **Bidirectional equivalents**: groups of encodings where detecting any 

10 member when another member was expected is considered correct. This 

11 includes UTF-16/UTF-32 endian variants (which encode the same text with 

12 different byte order) and ISO-2022-JP branch variants (which are 

13 compatible extensions of the same base encoding). 

14 

153. **Preferred superset mapping** for the ``prefer_superset`` API option: 

16 replaces detected ISO/subset encoding names with their Windows/CP superset 

17 equivalents that modern software actually uses. 

18 

194. **Compatibility names** for the default ``compat_names=True`` mode: maps 

20 internal Python codec names to the names chardet 5.x/6.x returned, 

21 preserving backward compatibility for callers that compare encoding 

22 strings directly. 

23""" 

24 

25from __future__ import annotations 

26 

27import unicodedata 

28from collections.abc import Callable 

29 

30from chardet.pipeline import DetectionDict 

31from chardet.registry import lookup_encoding 

32 

# Directional superset relationships: detecting any of the supersets
# when the expected encoding is the subset counts as correct.
# E.g., expected=ascii, detected=utf-8 -> correct (utf-8 ⊃ ascii).
# But expected=utf-8, detected=ascii -> wrong (ascii ⊄ utf-8).
#
# Keys are expected-encoding names as they appear in test-suite data;
# values are acceptable supersets spelled as Python codec names.
#
# Note: some subset keys (iso-8859-11) are not in the detection
# registry — the detector never returns them. They appear here because
# chardet test-suite expected values use these names, so the superset
# mapping is needed for accuracy evaluation only.
SUPERSETS: dict[str, frozenset[str]] = {
    "ASCII": frozenset({"utf-8", "cp1252"}),
    "TIS-620": frozenset({"iso8859-11", "cp874"}),
    "ISO-8859-11": frozenset({"cp874"}),
    "GB2312": frozenset({"gb18030"}),
    "GBK": frozenset({"gb18030"}),
    "Big5": frozenset({"big5hkscs", "cp950"}),
    "Shift_JIS": frozenset({"cp932", "shift_jis_2004"}),
    "Shift-JISX0213": frozenset({"shift_jis_2004"}),
    "EUC-JP": frozenset({"euc_jis_2004"}),
    "EUC-JISX0213": frozenset({"euc_jis_2004"}),
    "EUC-KR": frozenset({"cp949"}),
    "CP037": frozenset({"cp1140"}),
    # ISO-2022-JP subsets: any branch variant is acceptable.
    # ISO2022-JP-1 and ISO2022-JP-3 use Python codec names (no hyphen between
    # "ISO" and "2022") because they appear as expected values in the test suite,
    # not as canonical chardet output. They are consumed through
    # _NORMALIZED_SUPERSETS which normalizes via codecs.lookup().
    "ISO-2022-JP": frozenset({"iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"}),
    "ISO2022-JP-1": frozenset({"iso2022_jp_2", "iso2022_jp_ext"}),
    "ISO2022-JP-3": frozenset({"iso2022_jp_2004"}),
    # ISO/Windows superset pairs
    "ISO-8859-1": frozenset({"cp1252"}),
    "ISO-8859-2": frozenset({"cp1250"}),
    "ISO-8859-5": frozenset({"cp1251"}),
    "ISO-8859-6": frozenset({"cp1256"}),
    "ISO-8859-7": frozenset({"cp1253"}),
    "ISO-8859-8": frozenset({"cp1255"}),
    "ISO-8859-9": frozenset({"cp1254"}),
    "ISO-8859-13": frozenset({"cp1257"}),
}

73 

# Preferred superset name for each encoding, used by the ``prefer_superset``
# API option (applied via apply_preferred_superset below; the deprecated
# ``apply_legacy_rename`` alias serves callers written against the old
# ``should_rename_legacy`` option name). When enabled, detected encoding
# names are replaced with the Windows/CP superset that modern software
# actually uses (browsers, editors, etc. treat these ISO subsets as their
# Windows counterparts).
# Keys and values are internal Python codec names (e.g. "cp1252");
# display-cased output such as "Windows-1252" is presumably produced later
# by the compat-name mapping — confirm pipeline ordering if relying on it.
PREFERRED_SUPERSET: dict[str, str] = {
    "ascii": "cp1252",
    "euc_kr": "cp949",
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    "tis-620": "cp874",
}

93 

94 

95def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict: 

96 """Replace the encoding name using *mapping*, modifying *result* in-place.""" 

97 enc = result.get("encoding") 

98 if isinstance(enc, str): 

99 result["encoding"] = mapping.get(enc, enc) 

100 return result 

101 

102 

def apply_preferred_superset(result: DetectionDict) -> DetectionDict:
    """Swap the detected encoding for its preferred Windows/CP superset.

    The ``"encoding"`` entry of *result* is rewritten in-place via
    :data:`PREFERRED_SUPERSET`; the same dict is returned so calls can be
    chained fluently.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, PREFERRED_SUPERSET)


# Deprecated alias — kept for external consumers.
apply_legacy_rename = apply_preferred_superset

119 

120 

# Mapping from Python codec names to chardet 5.x/6.x compatible display names.
# Only entries where the codec name differs from the compat output are listed.
# Encodings where codec name == compat name (e.g., "ascii", "utf-8") and
# encodings new to v7 have no entry — the codec name passes through unchanged.
# Consumed by apply_compat_names() below.
_COMPAT_NAMES: dict[str, str] = {
    # 5.x compat — these encodings existed in chardet 5.x with different names
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # 6.x compat — new in chardet 6.x with different names
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}

# Backward compat alias
_LEGACY_NAMES = _COMPAT_NAMES

166 

167 

def apply_compat_names(result: DetectionDict) -> DetectionDict:
    """Rewrite the detected encoding to its chardet 5.x/6.x display name.

    The ``"encoding"`` entry of *result* is updated in-place via
    :data:`_COMPAT_NAMES`; the same dict is returned so the call can be
    chained fluently.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, _COMPAT_NAMES)

180 

181 

# Bidirectional equivalents — groups where any member is acceptable for any
# other. Covers UTF-16/UTF-32 endian variants and ISO-2022-JP branch variants.
BIDIRECTIONAL_GROUPS: tuple[tuple[str, ...], ...] = (
    ("utf-16", "utf-16-le", "utf-16-be"),
    ("utf-32", "utf-32-le", "utf-32-be"),
    ("iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"),
)

# Bidirectional language equivalences — groups of ISO 639-1 codes for
# languages that are nearly indistinguishable by statistical detection.
# Detecting any member when another member of the same group was expected
# is considered acceptable.
LANGUAGE_EQUIVALENCES: tuple[tuple[str, ...], ...] = (
    ("sk", "cs"),  # Slovak / Czech — ~85% mutual intelligibility
    (
        "uk",
        "ru",
        "bg",
        "be",
    ),  # East Slavic + Bulgarian — shared Cyrillic, high written overlap
    ("ms", "id"),  # Malay / Indonesian — standardized variants of one language
    (
        "no",
        "da",
        "sv",
    ),  # Scandinavian — mutual intelligibility across the dialect continuum
)

208 

209 

210def _build_group_index( 

211 groups: tuple[tuple[str, ...], ...], 

212 normalize: Callable[[str], str] = lambda x: x, 

213) -> dict[str, frozenset[str]]: 

214 """Build a lookup: key -> frozenset of all equivalent keys in the same group.""" 

215 result: dict[str, frozenset[str]] = {} 

216 for group in groups: 

217 normed = frozenset(normalize(n) for n in group) 

218 for name in group: 

219 result[normalize(name)] = normed 

220 return result 

221 

222 

# Pre-built lookup: language code -> frozenset of all equivalent codes.
_LANGUAGE_EQUIV: dict[str, frozenset[str]] = _build_group_index(LANGUAGE_EQUIVALENCES)

224 

225 

def is_language_equivalent(expected: str, detected: str) -> bool:
    """Check whether *detected* is an acceptable language for *expected*.

    Two codes are acceptable when they are identical or belong to the same
    equivalence group in :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    """
    # Empty-tuple default makes the membership test a no-op for unknown codes.
    return expected == detected or detected in _LANGUAGE_EQUIV.get(expected, ())

241 

242 

# Pre-built normalized lookups for fast comparison.
# Built iteratively because multiple SUPERSETS keys can normalize to the same
# canonical name (e.g., Shift_JIS and Shift-JISX0213 both → shift_jis_2004).
# Values are merged (unioned) when keys collide.
#
# The fallback for names the registry cannot resolve is ``.lower()`` — the
# same fallback used at comparison time by is_correct() and
# is_equivalent_detection(). Keeping the raw mixed-case name here (as before)
# meant an unresolvable key such as "Shift-JISX0213" could never equal the
# lowercased form computed during comparison.
_NORMALIZED_SUPERSETS: dict[str, frozenset[str]] = {}
for _subset, _supersets in SUPERSETS.items():
    _key = lookup_encoding(_subset) or _subset.lower()
    _normed = frozenset(lookup_encoding(s) or s.lower() for s in _supersets)
    _NORMALIZED_SUPERSETS[_key] = _NORMALIZED_SUPERSETS.get(_key, frozenset()) | _normed


_NORMALIZED_BIDIR: dict[str, frozenset[str]] = _build_group_index(
    BIDIRECTIONAL_GROUPS, normalize=lambda n: lookup_encoding(n) or n.lower()
)

257 

258 

def is_correct(expected: str | None, detected: str | None) -> bool:
    """Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional byte-order group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    """
    # None is only acceptable against None (binary expected, binary detected).
    if expected is None or detected is None:
        return expected is None and detected is None

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    # 1. Exact codec match after normalization.
    if norm_exp == norm_det:
        return True

    # 2. Same bidirectional (byte-order / branch-variant) group.
    if norm_det in _NORMALIZED_BIDIR.get(norm_exp, ()):
        return True

    # 3. Detected encoding is a registered superset of the expected one.
    return norm_det in _NORMALIZED_SUPERSETS.get(norm_exp, ())

292 

293 

294def _strip_combining(text: str) -> str: 

295 """NFKD-normalize *text* and strip all combining marks.""" 

296 nfkd = unicodedata.normalize("NFKD", text) 

297 return "".join(c for c in nfkd if not unicodedata.combining(c)) 

298 

299 

# Pre-computed symbol pair lookups for O(1) equivalence checks.
# Both orderings are stored to avoid constructing temporaries per call.
# Currently only the generic currency sign ¤ and the euro sign € —
# presumably to tolerate ISO-8859-1 vs ISO-8859-15 style differences;
# confirm before extending.
_EQUIVALENT_SYMBOL_PAIRS: frozenset[tuple[str, str]] = frozenset(
    {
        ("¤", "€"),
        ("€", "¤"),
    }
)

308 

309 

def _chars_equivalent(a: str, b: str) -> bool:
    """Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means: identical, an explicitly whitelisted symbol pair
    (e.g. ¤ ↔ €), or sharing the same base letter once combining marks
    are stripped.
    """
    if a == b or (a, b) in _EQUIVALENT_SYMBOL_PAIRS:
        return True
    # Fall back to comparing base letters without combining marks.
    return _strip_combining(a) == _strip_combining(b)

324 

325 

def is_equivalent_detection(
    data: bytes, expected: str | None, detected: str | None
) -> bool:
    """Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    """
    # None only matches None (binary expected, binary detected).
    if expected is None or detected is None:
        return expected is None and detected is None

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()
    if norm_exp == norm_det:
        return True

    # Decode with both codecs; unknown codecs or undecodable bytes fail fast.
    try:
        decoded_exp = data.decode(norm_exp)
        decoded_det = data.decode(norm_det)
    except (UnicodeDecodeError, LookupError):
        return False

    if decoded_exp == decoded_det:
        return True
    if len(decoded_exp) != len(decoded_det):
        return False

    # Same length, different text: accept only per-character equivalences.
    for char_exp, char_det in zip(decoded_exp, decoded_det, strict=True):
        if not _chars_equivalent(char_exp, char_det):
            return False
    return True