Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/postprocess.py: 33%

1"""Stage 13: post-processing rank corrections.

After statistical scoring produces a ranked list of candidates, three
byte-level evidence checks fix up the ranking when bigrams alone are
insufficient:

1. **Confusion-group resolution** (delegated to :mod:`chardet.pipeline.confusion`)
   — uses build-time-trained Unicode-category maps to break ties between
   confusable encoding pairs.
2. **Niche Latin demotion** — when an obscure ISO/Windows Latin encoding
   tops the ranking but the data contains none of its distinguishing bytes,
   promote a common Western Latin candidate (ISO-8859-1, ISO-8859-15,
   CP1252) to the top.
3. **KOI8-T promotion** — when KOI8-R wins but Tajik-specific bytes are
   present, promote KOI8-T (which shares the same Cyrillic block but maps
   different bytes to Tajik letters).

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.

21"""

23from chardet.pipeline import DetectionResult

24from chardet.pipeline.confusion import resolve_confusion_groups

26# Common Western Latin encodings that share the iso-8859-1 character

27# repertoire for the byte values where iso-8859-10 is indistinguishable.

28# Used as swap targets when demoting iso-8859-10 — we prefer these over

29# iso-8859-10, but do not want to accidentally promote an unrelated encoding

30# (e.g. windows-1254).

31_COMMON_LATIN_ENCODINGS: frozenset[str] = frozenset(

32 {

33 "iso8859-1",

34 "iso8859-15",

35 "cp1252",

36 }

37)

39# Bytes where iso-8859-10 decodes to a different character than iso-8859-1.

40# Computed programmatically via:

41# {b for b in range(0x80, 0x100)

42# if bytes([b]).decode('iso-8859-10') != bytes([b]).decode('iso-8859-1')}

43_ISO_8859_10_DISTINGUISHING: frozenset[int] = frozenset(

44 {

45 0xA1,

46 0xA2,

47 0xA3,

48 0xA4,

49 0xA5,

50 0xA6,

51 0xA8,

52 0xA9,

53 0xAA,

54 0xAB,

55 0xAC,

56 0xAE,

57 0xAF,

58 0xB1,

59 0xB2,

60 0xB3,

61 0xB4,

62 0xB5,

63 0xB6,

64 0xB8,

65 0xB9,

66 0xBA,

67 0xBB,

68 0xBC,

69 0xBD,

70 0xBE,

71 0xBF,

72 0xC0,

73 0xC7,

74 0xC8,

75 0xCA,

76 0xCC,

77 0xD1,

78 0xD2,

79 0xD7,

80 0xD9,

81 0xE0,

82 0xE7,

83 0xE8,

84 0xEA,

85 0xEC,

86 0xF1,

87 0xF2,

88 0xF7,

89 0xF9,

90 0xFF,

91 }

92)

94# Bytes where iso-8859-14 decodes to a different character than iso-8859-1.

95# Computed programmatically via:

96# {b for b in range(0x80, 0x100)

97# if bytes([b]).decode('iso-8859-14') != bytes([b]).decode('iso-8859-1')}

98_ISO_8859_14_DISTINGUISHING: frozenset[int] = frozenset(

99 {

100 0xA1,

101 0xA2,

102 0xA4,

103 0xA5,

104 0xA6,

105 0xA8,

106 0xAA,

107 0xAB,

108 0xAC,

109 0xAF,

110 0xB0,

111 0xB1,

112 0xB2,

113 0xB3,

114 0xB4,

115 0xB5,

116 0xB7,

117 0xB8,

118 0xB9,

119 0xBA,

120 0xBB,

121 0xBC,

122 0xBD,

123 0xBE,

124 0xBF,

125 0xD0,

126 0xD7,

127 0xDE,

128 0xF0,

129 0xF7,

130 0xFE,

131 }

132)

133

134# Bytes where windows-1254 has Turkish-specific characters that differ from

135# windows-1252. Windows-1254 differs from windows-1252 at 8 byte positions.

136# Two (0x8E, 0x9E) are undefined in Windows-1254 but defined in Windows-1252;

137# these are excluded here because undefined bytes are not useful for

138# identifying Turkish text. The remaining six positions map to

139# Turkish-specific letters and are the primary distinguishing signal.

140_WINDOWS_1254_DISTINGUISHING: frozenset[int] = frozenset(

141 {0xD0, 0xDD, 0xDE, 0xF0, 0xFD, 0xFE}

142)

143

144# Bytes where HP-Roman8 maps to lowercase accented letters but ISO-8859-1

145# maps to uppercase letters. Real HP-Roman8 text (from HP-UX terminals)

146# contains these bytes; data misdetected as HP-Roman8 typically does not.

147# {b for b in range(0x80, 0x100)

148# if (unicodedata.category(bytes([b]).decode('hp-roman8')) == 'Ll'

149# and unicodedata.category(bytes([b]).decode('iso-8859-1')) == 'Lu')}

150_HP_ROMAN8_DISTINGUISHING: frozenset[int] = frozenset(

151 {

152 0xC0,

153 0xC1,

154 0xC2,

155 0xC3,

156 0xC4,

157 0xC5,

158 0xC6,

159 0xC7,

160 0xC8,

161 0xC9,

162 0xCA,

163 0xCB,

164 0xCC,

165 0xCD,

166 0xCE,

167 0xCF,

168 0xD1,

169 0xD4,

170 0xD5,

171 0xD6,

172 0xD9,

173 0xDD,

174 0xDE,

175 }

176)

177

# Encodings that frequently win as false positives when none of their
# distinguishing bytes occur in the data.  Maps each such encoding name to
# the frozenset of byte values at which it differs from iso-8859-1 (or, for
# windows-1254, from windows-1252).
_DEMOTION_CANDIDATES: dict[str, frozenset[int]] = {
    # ISO Latin variants, compared against iso-8859-1.
    "iso8859-10": _ISO_8859_10_DISTINGUISHING,
    "iso8859-14": _ISO_8859_14_DISTINGUISHING,
    # Turkish code page, compared against windows-1252.
    "cp1254": _WINDOWS_1254_DISTINGUISHING,
    # HP-UX terminal encoding, compared against iso-8859-1.
    "hp-roman8": _HP_ROMAN8_DISTINGUISHING,
}

188

189# Bytes where KOI8-T maps to Tajik-specific Cyrillic letters but KOI8-R

190# maps to box-drawing characters. Presence of any of these bytes is strong

191# evidence for KOI8-T over KOI8-R.

192_KOI8_T_DISTINGUISHING: frozenset[int] = frozenset(

193 {0x80, 0x81, 0x83, 0x8A, 0x8C, 0x8D, 0x8E, 0x90, 0xA1, 0xA2, 0xA5, 0xB5}

194)

195

196

def _should_demote(encoding: str, data: bytes) -> bool:
    """Tell whether *encoding* should lose its rank for lack of byte evidence.

    Only encodings registered in ``_DEMOTION_CANDIDATES`` can be demoted.
    For those, *data* is scanned for a non-ASCII byte that belongs to the
    encoding's distinguishing set.  Finding one is byte-level evidence in
    favour of the encoding, so it is kept; finding none means there is no
    such evidence and the encoding should be demoted.

    :param encoding: Name of the candidate encoding to examine.
    :param data: The raw bytes that were scored.
    :returns: ``True`` when *encoding* is a demotion candidate and *data*
        contains none of its distinguishing bytes, ``False`` otherwise.
    """
    marker_bytes = _DEMOTION_CANDIDATES.get(encoding)
    if marker_bytes is None:
        # Not one of the false-positive-prone encodings: never demote.
        return False
    for byte in data:
        if byte > 0x7F and byte in marker_bytes:
            # One distinguishing byte is enough to keep the encoding.
            return False
    return True

209

210

def _demote_niche_latin(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Swap a niche Latin winner for a common Western Latin candidate.

    Bigram models for encodings such as iso-8859-10, iso-8859-14 or
    windows-1254 can come out on top for data made up solely of bytes they
    share with the common Western Latin encodings.  When the top-ranked
    encoding has no byte-level evidence in *data*, the first lower-ranked
    entry whose encoding is in ``_COMMON_LATIN_ENCODINGS`` is moved to the
    front (taking over the winner's confidence) and every entry for the
    demoted encoding is moved to the end.

    :param data: The raw bytes that were scored.
    :param results: Candidates ranked best-first.
    :returns: *results* itself when nothing needs to change, otherwise a new
        re-ordered list.
    """
    if len(results) <= 1:
        return results

    winner = results[0]
    if winner.encoding is None or not _should_demote(winner.encoding, data):
        return results

    # First lower-ranked candidate that is a mainstream Latin encoding.
    replacement = next(
        (cand for cand in results[1:] if cand.encoding in _COMMON_LATIN_ENCODINGS),
        None,
    )
    if replacement is None:
        # Nothing suitable to promote; leave the ranking untouched.
        return results

    losing_name = winner.encoding
    head = DetectionResult(
        replacement.encoding,
        winner.confidence,  # the promoted entry inherits the top confidence
        replacement.language,
        replacement.mime_type,
    )

    # Partition the remaining entries in one pass, preserving relative order.
    middle: list[DetectionResult] = []
    tail: list[DetectionResult] = []
    for entry in results:
        if entry.encoding == losing_name:
            tail.append(entry)
        elif entry is not replacement:
            middle.append(entry)
    return [head, *middle, *tail]

241

242

def _promote_koi8t(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Move KOI8-T ahead of a winning KOI8-R when Tajik bytes are present.

    KOI8-T and KOI8-R share the whole 0xC0-0xFF Cyrillic letter block, which
    makes them hard to tell apart statistically.  KOI8-T, however, maps 12
    bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
    box-drawing characters.  If any of those bytes occurs in *data*, KOI8-T
    is the better match.

    :param data: The raw bytes that were scored.
    :param results: Candidates ranked best-first.
    :returns: *results* itself when no promotion applies, otherwise a new
        list with a KOI8-T entry (carrying the former top confidence) first,
        followed by all other entries in their original order.
    """
    if not results:
        return results

    leader = results[0]
    if leader.encoding != "koi8-r":
        return results

    # Locate the first KOI8-T candidate, if one was produced at all.
    tajik_pos = -1
    for pos, cand in enumerate(results):
        if cand.encoding == "koi8-t":
            tajik_pos = pos
            break
    if tajik_pos < 0:
        return results

    # Look for at least one Tajik-specific byte in the data.
    has_tajik_byte = False
    for byte in data:
        if byte > 0x7F and byte in _KOI8_T_DISTINGUISHING:
            has_tajik_byte = True
            break
    if not has_tajik_byte:
        return results

    tajik = results[tajik_pos]
    boosted = DetectionResult(
        tajik.encoding,
        leader.confidence,  # the promoted entry inherits the top confidence
        tajik.language,
        tajik.mime_type,
    )
    # Everything except the original KOI8-T entry, order preserved.
    return [boosted] + results[:tajik_pos] + results[tajik_pos + 1 :]

274

275

def postprocess_results(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Run the three rank-correction passes over scored detection results.

    The passes are applied in a fixed order after statistical scoring:
    confusion-group resolution, then niche Latin demotion, then KOI8-T
    promotion.  Each pass receives the output of the previous one together
    with the raw *data*, and may re-order or replace entries.

    :param data: The raw byte data the results were produced from.
    :param results: A list of :class:`DetectionResult` ranked by confidence.
    :returns: A new list (or the same list) with rank corrections applied.
    """
    ranked = resolve_confusion_groups(data, results)
    ranked = _demote_niche_latin(data, ranked)
    ranked = _promote_koi8t(data, ranked)
    return ranked