Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/postprocess.py: 33%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

43 statements  

"""Stage 13: post-processing rank corrections.

After statistical scoring produces a ranked list of candidates, three
byte-level evidence checks fix up the ranking when bigrams alone are
insufficient:

1. **Confusion-group resolution** (delegated to :mod:`chardet.pipeline.confusion`)
   — uses build-time-trained Unicode-category maps to break ties between
   confusable encoding pairs.
2. **Niche Latin demotion** — when an obscure ISO/Windows Latin encoding
   tops the ranking but the data contains none of its distinguishing bytes,
   promote a common Western Latin candidate (ISO-8859-1, ISO-8859-15,
   CP1252) to the top.
3. **KOI8-T promotion** — when KOI8-R wins but Tajik-specific bytes are
   present, promote KOI8-T (which shares the same Cyrillic block but maps
   different bytes to Tajik letters).

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

22 

23from chardet.pipeline import DetectionResult 

24from chardet.pipeline.confusion import resolve_confusion_groups 

25 

# Common Western Latin encodings that share the iso-8859-1 character
# repertoire for the byte values where a niche Latin encoding (e.g.
# iso-8859-10) is indistinguishable from them.
# Used as swap targets by ``_demote_niche_latin`` when demoting a niche
# encoding — we prefer these over the demoted encoding, but do not want to
# accidentally promote an unrelated encoding (e.g. windows-1254).
_COMMON_LATIN_ENCODINGS: frozenset[str] = frozenset(
    {
        "iso8859-1",
        "iso8859-15",
        "cp1252",
    }
)

38 

# Bytes where iso-8859-10 decodes to a different character than iso-8859-1.
# Computed programmatically via:
#   {b for b in range(0x80, 0x100)
#    if bytes([b]).decode('iso-8859-10') != bytes([b]).decode('iso-8859-1')}
_ISO_8859_10_DISTINGUISHING: frozenset[int] = frozenset(
    {
        0xA1,
        0xA2,
        0xA3,
        0xA4,
        0xA5,
        0xA6,
        0xA8,
        0xA9,
        0xAA,
        0xAB,
        0xAC,
        0xAE,
        0xAF,
        0xB1,
        0xB2,
        0xB3,
        0xB4,
        0xB5,
        0xB6,
        0xB8,
        0xB9,
        0xBA,
        0xBB,
        0xBC,
        0xBD,
        0xBE,
        0xBF,
        0xC0,
        0xC7,
        0xC8,
        0xCA,
        0xCC,
        0xD1,
        0xD2,
        0xD7,
        0xD9,
        0xE0,
        0xE7,
        0xE8,
        0xEA,
        0xEC,
        0xF1,
        0xF2,
        0xF7,
        0xF9,
        0xFF,
    }
)

93 

# Bytes where iso-8859-14 decodes to a different character than iso-8859-1.
# Computed programmatically via:
#   {b for b in range(0x80, 0x100)
#    if bytes([b]).decode('iso-8859-14') != bytes([b]).decode('iso-8859-1')}
_ISO_8859_14_DISTINGUISHING: frozenset[int] = frozenset(
    {
        0xA1,
        0xA2,
        0xA4,
        0xA5,
        0xA6,
        0xA8,
        0xAA,
        0xAB,
        0xAC,
        0xAF,
        0xB0,
        0xB1,
        0xB2,
        0xB3,
        0xB4,
        0xB5,
        0xB7,
        0xB8,
        0xB9,
        0xBA,
        0xBB,
        0xBC,
        0xBD,
        0xBE,
        0xBF,
        0xD0,
        0xD7,
        0xDE,
        0xF0,
        0xF7,
        0xFE,
    }
)

133 

# Bytes where windows-1254 has Turkish-specific characters that differ from
# windows-1252.  Windows-1254 differs from windows-1252 at 8 byte positions.
# Two (0x8E, 0x9E) are undefined in Windows-1254 but defined in Windows-1252;
# these are excluded here because undefined bytes are not useful for
# identifying Turkish text.  The remaining six positions map to
# Turkish-specific letters and are the primary distinguishing signal.
_WINDOWS_1254_DISTINGUISHING: frozenset[int] = frozenset(
    {0xD0, 0xDD, 0xDE, 0xF0, 0xFD, 0xFE}
)

143 

# Bytes where HP-Roman8 maps to lowercase accented letters but ISO-8859-1
# maps to uppercase letters.  Real HP-Roman8 text (from HP-UX terminals)
# contains these bytes; data misdetected as HP-Roman8 typically does not.
# Computed programmatically via:
#   {b for b in range(0x80, 0x100)
#    if (unicodedata.category(bytes([b]).decode('hp-roman8')) == 'Ll'
#        and unicodedata.category(bytes([b]).decode('iso-8859-1')) == 'Lu')}
_HP_ROMAN8_DISTINGUISHING: frozenset[int] = frozenset(
    {
        0xC0,
        0xC1,
        0xC2,
        0xC3,
        0xC4,
        0xC5,
        0xC6,
        0xC7,
        0xC8,
        0xC9,
        0xCA,
        0xCB,
        0xCC,
        0xCD,
        0xCE,
        0xCF,
        0xD1,
        0xD4,
        0xD5,
        0xD6,
        0xD9,
        0xDD,
        0xDE,
    }
)

177 

# Encodings that are often false positives when their distinguishing bytes
# are absent.  Keyed by encoding name -> frozenset of byte values where
# that encoding differs from iso-8859-1 (or windows-1252 in the case of
# windows-1254).
_DEMOTION_CANDIDATES: dict[str, frozenset[int]] = {
    "iso8859-10": _ISO_8859_10_DISTINGUISHING,
    "iso8859-14": _ISO_8859_14_DISTINGUISHING,
    "cp1254": _WINDOWS_1254_DISTINGUISHING,
    "hp-roman8": _HP_ROMAN8_DISTINGUISHING,
}

188 

# Bytes where KOI8-T maps to Tajik-specific Cyrillic letters but KOI8-R
# maps to box-drawing characters.  Presence of any of these bytes is strong
# evidence for KOI8-T over KOI8-R.
_KOI8_T_DISTINGUISHING: frozenset[int] = frozenset(
    {0x80, 0x81, 0x83, 0x8A, 0x8C, 0x8D, 0x8E, 0x90, 0xA1, 0xA2, 0xA5, 0xB5}
)

195 

196 

def _should_demote(encoding: str, data: bytes) -> bool:
    """Return True if *encoding* is a demotion candidate with no distinguishing bytes.

    Looks *encoding* up in ``_DEMOTION_CANDIDATES``.  Encodings that are not
    listed there are never demoted.  For a listed encoding, scans the
    non-ASCII bytes of *data* for any value in that encoding's
    distinguishing set (the byte values that decode differently under the
    candidate than under its common Western Latin baseline — iso-8859-1, or
    windows-1252 in the case of windows-1254).  If none occur, the data is
    equally valid under both encodings and there is no byte-level evidence
    for preferring the candidate.

    :param encoding: Encoding name of the top-ranked candidate.
    :param data: The raw bytes being classified.
    :returns: ``True`` when *encoding* is a demotion candidate and *data*
        contains none of its distinguishing bytes (this includes empty or
        pure-ASCII *data*); ``False`` otherwise.
    """
    distinguishing = _DEMOTION_CANDIDATES.get(encoding)
    if distinguishing is None:
        # Not a niche encoding we know how to second-guess.
        return False
    return not any(b in distinguishing for b in data if b > 0x7F)

209 

210 

def _demote_niche_latin(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Demote niche Latin encodings when no distinguishing bytes are present.

    Some bigram models (e.g. iso-8859-10, iso-8859-14, windows-1254) can win
    on data that contains only bytes shared with common Western Latin
    encodings.  When there is no byte-level evidence for the winning
    encoding, promote the first common Western Latin candidate to the top
    (giving it the former winner's confidence) and push every entry for the
    demoted encoding to the end of the list.

    :param data: The raw byte data the results were produced from.
    :param results: Candidates ranked best-first.
    :returns: A new re-ranked list when a demotion happens; otherwise the
        *results* list itself, unchanged.
    """
    if len(results) < 2:
        # Nothing to swap with.
        return results

    top = results[0]
    demoted_encoding = top.encoding
    if demoted_encoding is None or not _should_demote(demoted_encoding, data):
        return results

    for candidate in results[1:]:
        if candidate.encoding not in _COMMON_LATIN_ENCODINGS:
            continue

        # The promoted entry inherits the former winner's confidence so it
        # does not rank below the entries it now precedes.
        promoted = DetectionResult(
            candidate.encoding,
            top.confidence,
            candidate.language,
            candidate.mime_type,
        )

        # Single pass: entries for the demoted encoding go last, the
        # original (pre-promotion) candidate entry is dropped, and
        # everything else keeps its relative order.
        kept: list[DetectionResult] = []
        demoted: list[DetectionResult] = []
        for entry in results:
            if entry.encoding == demoted_encoding:
                demoted.append(entry)
            elif entry is not candidate:
                kept.append(entry)
        return [promoted, *kept, *demoted]

    # No common Western Latin alternative was offered; leave ranking alone.
    return results

241 

242 

def _promote_koi8t(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Promote KOI8-T over KOI8-R when Tajik-specific bytes are present.

    KOI8-T and KOI8-R share the entire 0xC0-0xFF Cyrillic letter block,
    making statistical discrimination difficult.  However, KOI8-T maps 12
    bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
    box-drawing characters.  If any of these bytes appear, KOI8-T is the
    better match.

    :param data: The raw byte data the results were produced from.
    :param results: Candidates ranked best-first.
    :returns: A new list with a KOI8-T entry first (carrying the former
        top confidence) followed by all other entries in their original
        order, or the *results* list itself when no promotion applies.
    """
    # Only act when KOI8-R is the current winner.
    if not results or results[0].encoding != "koi8-r":
        return results

    # KOI8-T must already be among the candidates to be promoted.
    koi8t_idx = next(
        (i for i, r in enumerate(results) if r.encoding == "koi8-t"),
        None,
    )
    if koi8t_idx is None:
        return results

    # Without Tajik-specific bytes there is no evidence against KOI8-R.
    if not any(b in _KOI8_T_DISTINGUISHING for b in data if b > 0x7F):
        return results

    koi8t_result = results[koi8t_idx]
    promoted = DetectionResult(
        koi8t_result.encoding,
        results[0].confidence,
        koi8t_result.language,
        koi8t_result.mime_type,
    )
    others = [r for i, r in enumerate(results) if i != koi8t_idx]
    return [promoted, *others]

274 

275 

def postprocess_results(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Apply confusion-group resolution, niche Latin demotion, and KOI8-T promotion.

    These three rank-correction steps run in sequence after statistical
    scoring.  Each step inspects byte-level evidence in *data* and may
    re-order or replace entries in *results*; each step receives the output
    of the previous one.

    :param data: The raw byte data the results were produced from.
    :param results: A list of :class:`DetectionResult` ranked by confidence.
    :returns: A new list (or the same list) with rank corrections applied.
    """
    results = resolve_confusion_groups(data, results)
    results = _demote_niche_latin(data, results)
    return _promote_koi8t(data, results)