Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/output_names.py: 93%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

15 statements  

1"""Public-API encoding-name remapping. 

2 

3Two output transforms applied to detection results before they cross the 

4public API: 

5 

6* :func:`apply_preferred_superset` -- when the ``prefer_superset`` API option 

7 is enabled, replaces detected ISO/subset encoding names with their 

8 Windows/CP supersets that modern software actually uses 

9 (e.g., ISO-8859-1 -> Windows-1252). 

10 

11* :func:`apply_compat_names` -- when the default ``compat_names=True`` mode 

12 is enabled, maps internal Python codec names to the names chardet 5.x/6.x 

13 returned, preserving backward compatibility for callers that compare 

14 encoding strings directly. 

15 

16Both transforms operate in-place on a :class:`~chardet.pipeline.DetectionDict` 

17and return the same dict for fluent chaining. 

18""" 

19 

20from __future__ import annotations 

21 

22from chardet.pipeline import DetectionDict 

23 

# Lookup table behind the ``prefer_superset`` API option: each key is a
# detected encoding and each value is the Windows/CP superset reported in its
# place (browsers, editors, etc. treat these ISO subsets as their Windows
# counterparts).
# Keys and values are both Python codec names (e.g. "iso8859-1" -> "cp1252");
# the display-cased forms such as "Windows-1252" live in ``_COMPAT_NAMES``.
PREFERRED_SUPERSET: dict[str, str] = {
    # Non-ISO-8859 subsets.
    "ascii": "cp1252",
    "euc_kr": "cp949",
    # ISO-8859 parts and the code page that replaces each one.
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    # Shares its superset with iso8859-11.
    "tis-620": "cp874",
}

43 

44 

# Python codec name -> display name that chardet 5.x/6.x reported for it.
# The table is deliberately sparse: a codec appears only when its compat
# spelling differs from the codec name.  Codecs whose two names coincide
# (e.g., "ascii", "utf-8") and encodings new to v7 are omitted, so their
# codec name passes through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # Encodings chardet 5.x already reported, under a different spelling.
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # Encodings first reported by chardet 6.x, under a different spelling.
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}

87 

88 

def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
    """Translate ``result["encoding"]`` through *mapping*, in-place.

    A name with no entry in *mapping* is written back unchanged.  When the
    ``"encoding"`` value is missing or is not a string (for example
    ``None``), *result* is left untouched.

    :param result: A detection result dict; its ``"encoding"`` value is read
        and, if it is a string, overwritten.
    :param mapping: Lookup table from current encoding name to replacement.
    :returns: The same *result* object that was passed in.
    """
    current = result.get("encoding")
    # Guard clause: only string names can be looked up in the table.
    if not isinstance(current, str):
        return result
    # Fall back to the current name when the table has no replacement.
    result["encoding"] = mapping.get(current, current)
    return result

95 

96 

def apply_preferred_superset(result: DetectionDict) -> DetectionDict:
    """Swap the detected encoding for its preferred Windows/CP superset.

    The ``"encoding"`` value of *result* is looked up in
    ``PREFERRED_SUPERSET`` and overwritten in-place; a name with no entry
    there is kept as it is.  The dict is handed back so calls can be
    chained.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, PREFERRED_SUPERSET)


# Deprecated name for ``apply_preferred_superset``, retained so external
# consumers that still import it keep working.
apply_legacy_rename = apply_preferred_superset

113 

114 

def apply_compat_names(result: DetectionDict) -> DetectionDict:
    """Rewrite the encoding name to its chardet 5.x/6.x compatible spelling.

    The ``"encoding"`` value of *result* is looked up in ``_COMPAT_NAMES``
    and overwritten in-place; a name with no entry there is kept as it is.
    The dict is handed back so calls can be chained.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, _COMPAT_NAMES)