# chardet/equivalences.py

1"""Encoding equivalences and name remapping. 

2 

3This module defines: 

4 

51. **Directional supersets** for accuracy evaluation: detecting a superset 

6 encoding when the expected encoding is a subset is correct (e.g., detecting 

7 UTF-8 when expected is ASCII), but not the reverse. 

8 

92. **Bidirectional equivalents**: groups of encodings where detecting any 

10 member when another member was expected is considered correct. This 

11 includes UTF-16/UTF-32 endian variants (which encode the same text with 

12 different byte order) and ISO-2022-JP branch variants (which are 

13 compatible extensions of the same base encoding). 

14 

153. **Preferred superset mapping** for the ``prefer_superset`` API option: 

16 replaces detected ISO/subset encoding names with their Windows/CP superset 

17 equivalents that modern software actually uses. 

18 

194. **Compatibility names** for the default ``compat_names=True`` mode: maps 

20 internal Python codec names to the names chardet 5.x/6.x returned, 

21 preserving backward compatibility for callers that compare encoding 

22 strings directly. 

23""" 

from __future__ import annotations

import unicodedata
from collections.abc import Callable

from chardet.pipeline import DetectionDict
from chardet.registry import lookup_encoding

# Directional superset relationships: detecting any of the supersets
# when the expected encoding is the subset counts as correct.
# E.g., expected=ascii, detected=utf-8 -> correct (utf-8 ⊃ ascii).
# But expected=utf-8, detected=ascii -> wrong (ascii ⊅ utf-8).
#
# Note: some subset keys (iso-8859-11) are not in the detection
# registry — the detector never returns them. They appear here because
# chardet test-suite expected values use these names, so the superset
# mapping is needed for accuracy evaluation only.
SUPERSETS: dict[str, frozenset[str]] = {
    "ASCII": frozenset({"utf-8", "cp1252"}),
    "TIS-620": frozenset({"iso8859-11", "cp874"}),
    "ISO-8859-11": frozenset({"cp874"}),
    "GB2312": frozenset({"gb18030"}),
    "GBK": frozenset({"gb18030"}),
    "Big5": frozenset({"big5hkscs", "cp950"}),
    "Shift_JIS": frozenset({"cp932", "shift_jis_2004"}),
    "Shift-JISX0213": frozenset({"shift_jis_2004"}),
    "EUC-JP": frozenset({"euc_jis_2004"}),
    "EUC-JISX0213": frozenset({"euc_jis_2004"}),
    "EUC-KR": frozenset({"cp949"}),
    "CP037": frozenset({"cp1140"}),
    # ISO-2022-JP subsets: any branch variant is acceptable.
    # In our registry, base ISO-2022-JP is an alias of iso2022_jp_2, so all
    # three extended variants are supersets of the same base. While the
    # extended variants use different escape sequences for non-basic characters,
    # real-world files rarely use those extensions — the base JIS X 0208
    # character set is shared by all variants and cross-decodes identically.
    # ISO2022-JP-1 and ISO2022-JP-3 are spelled without a hyphen between
    # "ISO" and "2022" because that is how they appear as expected values in
    # the test suite; they are not canonical chardet output. They are consumed
    # through _NORMALIZED_SUPERSETS, which normalizes via lookup_encoding().
    "ISO-2022-JP": frozenset({"iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"}),
    "ISO2022-JP-1": frozenset({"iso2022_jp_2", "iso2022_jp_ext"}),
    "ISO2022-JP-3": frozenset({"iso2022_jp_2004"}),
    # ISO/Windows superset pairs
    "ISO-8859-1": frozenset({"cp1252"}),
    "ISO-8859-2": frozenset({"cp1250"}),
    "ISO-8859-5": frozenset({"cp1251"}),
    "ISO-8859-6": frozenset({"cp1256"}),
    "ISO-8859-7": frozenset({"cp1253"}),
    "ISO-8859-8": frozenset({"cp1255"}),
    "ISO-8859-9": frozenset({"cp1254"}),
    "ISO-8859-13": frozenset({"cp1257"}),
    # UTF-16/32: the bare form (BOM-aware) is interchangeable with either
    # endianness, but LE and BE are NOT interchangeable with each other.
    "UTF-16": frozenset({"utf-16-le", "utf-16-be"}),
    "UTF-16-LE": frozenset({"utf-16"}),
    "UTF-16-BE": frozenset({"utf-16"}),
    "UTF-32": frozenset({"utf-32-le", "utf-32-be"}),
    "UTF-32-LE": frozenset({"utf-32"}),
    "UTF-32-BE": frozenset({"utf-32"}),
}
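
# Illustration (doctest-style; is_correct() below consults a normalized
# form of this table):
#
#     >>> "utf-8" in SUPERSETS["ASCII"]   # expected=ASCII, detected=UTF-8: OK
#     True
#     >>> "UTF-8" in SUPERSETS            # no reverse entry: ASCII is not an
#     False                               # acceptable answer for UTF-8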

# Preferred superset name for each encoding, used by the ``prefer_superset``
# API option (historically ``should_rename_legacy``; see the deprecated
# ``apply_legacy_rename`` alias below). When enabled, detected encoding names
# are replaced with the Windows/CP superset that modern software actually uses
# (browsers, editors, etc. treat these ISO subsets as their Windows
# counterparts).
# Values are internal codec names (e.g. "cp1252"); where a compat display name
# exists, the ``compat_names`` pass converts them (e.g. to "Windows-1252").
PREFERRED_SUPERSET: dict[str, str] = {
    "ascii": "cp1252",
    "euc_kr": "cp949",
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    "tis-620": "cp874",
}


def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
    """Replace the encoding name using *mapping*, modifying *result* in-place."""
    enc = result.get("encoding")
    if isinstance(enc, str):
        result["encoding"] = mapping.get(enc, enc)
    return result


def apply_preferred_superset(
    result: DetectionDict,
) -> DetectionDict:
    """Replace the encoding name with its preferred Windows/CP superset.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, PREFERRED_SUPERSET)
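
# Usage sketch (hypothetical result values; only the "encoding" key matters):
#
#     >>> result = {"encoding": "iso8859-1", "confidence": 0.73, "language": "en"}
#     >>> apply_preferred_superset(result)["encoding"]
#     'cp1252'
#     >>> result["encoding"]  # the input dict was modified in-place
#     'cp1252'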


# Deprecated alias — kept for external consumers.
apply_legacy_rename = apply_preferred_superset


# Mapping from Python codec names to chardet 5.x/6.x compatible display names.
# Only entries where codec name differs from the compat output are listed.
# Encodings where codec name == compat name (e.g., "ascii", "utf-8") and
# encodings new to v7 have no entry — the codec name passes through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # 5.x compat — these encodings existed in chardet 5.x with different names
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # 6.x compat — new in chardet 6.x with different names
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}


def apply_compat_names(
    result: DetectionDict,
) -> DetectionDict:
    """Convert internal codec names to chardet 5.x/6.x compatible names.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, _COMPAT_NAMES)
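
# Usage sketch (hypothetical result values):
#
#     >>> result = {"encoding": "cp1252", "confidence": 0.99, "language": "fr"}
#     >>> apply_compat_names(result)["encoding"]
#     'Windows-1252'
#     >>> apply_compat_names({"encoding": "utf-8"})["encoding"]  # no entry: unchanged
#     'utf-8'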


# Bidirectional equivalents -- groups where any member is acceptable for any other.
#
# NOTE: UTF-16/32 endianness is handled via directional SUPERSETS instead,
# because wrong endianness garbles text. The ISO-2022-JP variants stay here
# because the SUPERSETS entries only cover the directional case where the
# base encoding (an alias of iso2022_jp_2 in our registry) is expected;
# this group additionally accepts any extended variant when another
# extended variant was expected.
BIDIRECTIONAL_GROUPS: tuple[tuple[str, ...], ...] = (
    ("iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"),
)
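
# Example (a sketch; assumes lookup_encoding() maps these codec names to
# themselves, so both sides land in the same group):
#
#     >>> is_correct("iso2022_jp_2004", "iso2022_jp_ext")
#     True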

# Bidirectional language equivalences — groups of ISO 639-1 codes for
# languages that are nearly indistinguishable by statistical detection.
# Detecting any member when another member of the same group was expected
# is considered acceptable.
LANGUAGE_EQUIVALENCES: tuple[tuple[str, ...], ...] = (
    ("sk", "cs"),  # Slovak / Czech — ~85% mutual intelligibility
    (
        "uk",
        "ru",
        "bg",
        "be",
    ),  # East Slavic + Bulgarian — shared Cyrillic, high written overlap
    ("ms", "id"),  # Malay / Indonesian — standardized variants of one language
    (
        "no",
        "da",
        "sv",
    ),  # Scandinavian — mutual intelligibility across the dialect continuum
)


def _build_group_index(
    groups: tuple[tuple[str, ...], ...],
    normalize: Callable[[str], str] = lambda x: x,
) -> dict[str, frozenset[str]]:
    """Build a lookup: key -> frozenset of all equivalent keys in the same group."""
    result: dict[str, frozenset[str]] = {}
    for group in groups:
        normed = frozenset(normalize(n) for n in group)
        for name in group:
            result[normalize(name)] = normed
    return result
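
# Toy example (illustrative names, not real encodings):
#
#     >>> idx = _build_group_index((("A", "B"),), normalize=str.lower)
#     >>> idx["a"] == frozenset({"a", "b"})
#     True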


_LANGUAGE_EQUIV: dict[str, frozenset[str]] = _build_group_index(LANGUAGE_EQUIVALENCES)


def is_language_equivalent(expected: str, detected: str) -> bool:
    """Check whether *detected* is an acceptable language for *expected*.

    Returns ``True`` when *expected* and *detected* are the same ISO 639-1
    code, or belong to the same equivalence group in
    :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    """
    if expected == detected:
        return True
    group = _LANGUAGE_EQUIV.get(expected)
    return group is not None and detected in group
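
# Examples (drawn from the groups above; membership is symmetric):
#
#     >>> is_language_equivalent("sk", "cs")
#     True
#     >>> is_language_equivalent("cs", "de")
#     False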


# Pre-built normalized lookups for fast comparison.
# Built iteratively because multiple SUPERSETS keys can normalize to the same
# canonical name (e.g., Shift_JIS and Shift-JISX0213 both → shift_jis_2004).
# Values are merged (unioned) when keys collide.
_NORMALIZED_SUPERSETS: dict[str, frozenset[str]] = {}
for _subset, _supersets in SUPERSETS.items():
    _key = lookup_encoding(_subset) or _subset
    _normed = frozenset(lookup_encoding(s) or s for s in _supersets)
    _NORMALIZED_SUPERSETS[_key] = _NORMALIZED_SUPERSETS.get(_key, frozenset()) | _normed
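
# Collision-merge sketch (pure-dict illustration with made-up keys, showing
# how two colliding entries union rather than overwrite):
#
#     >>> merged: dict[str, frozenset[str]] = {}
#     >>> for key, vals in [("sjis", {"cp932"}), ("sjis", {"shift_jis_2004"})]:
#     ...     merged[key] = merged.get(key, frozenset()) | frozenset(vals)
#     >>> sorted(merged["sjis"])
#     ['cp932', 'shift_jis_2004']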


_NORMALIZED_BIDIR: dict[str, frozenset[str]] = _build_group_index(
    BIDIRECTIONAL_GROUPS, normalize=lambda n: lookup_encoding(n) or n
)


def is_correct(expected: str | None, detected: str | None) -> bool:
    """Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional equivalence group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False
    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    # 1. Exact match
    if norm_exp == norm_det:
        return True

    # 2. Bidirectional (same equivalence group)
    if norm_exp in _NORMALIZED_BIDIR and norm_det in _NORMALIZED_BIDIR[norm_exp]:
        return True

    # 3. Superset is acceptable (detected is a known superset of expected)
    return (
        norm_exp in _NORMALIZED_SUPERSETS
        and norm_det in _NORMALIZED_SUPERSETS[norm_exp]
    )
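
# Examples (a sketch; assumes lookup_encoding() canonicalizes the stdlib
# UTF-16 names as usual):
#
#     >>> is_correct(None, None)                # binary file, nothing detected
#     True
#     >>> is_correct("UTF-16", "utf-16-le")     # bare form accepts either endianness
#     True
#     >>> is_correct("utf-16-le", "utf-16-be")  # wrong endianness is rejected
#     False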


def _strip_combining(text: str) -> str:
    """NFKD-normalize *text* and strip all combining marks."""
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join(c for c in nfkd if not unicodedata.combining(c))
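
# Example: U+00E9 decomposes under NFKD to "e" plus a combining acute accent,
# and the combining mark is then dropped:
#
#     >>> _strip_combining("café")
#     'cafe'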


# Pre-computed symbol pair lookups for O(1) equivalence checks.
# Both orderings are stored to avoid constructing temporaries per call.
_EQUIVALENT_SYMBOL_PAIRS: frozenset[tuple[str, str]] = frozenset(
    {
        ("¤", "€"),
        ("€", "¤"),
    }
)


def _chars_equivalent(a: str, b: str) -> bool:
    """Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means:
    - Same character, OR
    - Same base letter after stripping combining marks, OR
    - An explicitly listed symbol equivalence (e.g. ¤ ↔ €)
    """
    if a == b:
        return True
    if (a, b) in _EQUIVALENT_SYMBOL_PAIRS:
        return True
    # Compare base letters after stripping combining marks.
    return _strip_combining(a) == _strip_combining(b)
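
# Examples:
#
#     >>> _chars_equivalent("é", "e")   # same base letter
#     True
#     >>> _chars_equivalent("¤", "€")   # listed symbol equivalence
#     True
#     >>> _chars_equivalent("a", "b")
#     False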


def is_equivalent_detection(
    data: bytes, expected: str | None, detected: str | None
) -> bool:
    """Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    if norm_exp == norm_det:
        return True

    try:
        text_exp = data.decode(norm_exp)
        text_det = data.decode(norm_det)
    except (UnicodeDecodeError, LookupError):
        return False

    if text_exp == text_det:
        return True

    if len(text_exp) != len(text_det):
        return False

    return all(_chars_equivalent(a, b) for a, b in zip(text_exp, text_det, strict=True))
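
# Example: byte 0xA4 decodes to the currency sign "¤" in ISO-8859-1 but to the
# euro sign "€" in ISO-8859-15, a listed symbol equivalence, so the two
# decodings count as functionally identical (a sketch; assumes both names
# resolve through the registry or the stdlib fallback):
#
#     >>> is_equivalent_detection(b"100\xa4", "iso8859-1", "iso8859-15")
#     True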