1"""Stage 13: post-processing rank corrections.
2
3After statistical scoring produces a ranked list of candidates, three
4byte-level evidence checks fix up the ranking when bigrams alone are
5insufficient:
6
71. **Confusion-group resolution** (delegated to :mod:`chardet.pipeline.confusion`)
8 — uses build-time-trained Unicode-category maps to break ties between
9 confusable encoding pairs.
102. **Niche Latin demotion** — when an obscure ISO/Windows Latin encoding
11 tops the ranking but the data contains none of its distinguishing bytes,
12 promote a common Western Latin candidate (ISO-8859-1, ISO-8859-15,
13 CP1252) to the top.
143. **KOI8-T promotion** — when KOI8-R wins but Tajik-specific bytes are
15 present, promote KOI8-T (which shares the same Cyrillic block but maps
16 different bytes to Tajik letters).
17
18Note: ``from __future__ import annotations`` is intentionally omitted because
19this module is compiled with mypyc, which does not support PEP 563 string
20annotations.
21"""
22
23from chardet.pipeline import DetectionResult
24from chardet.pipeline.confusion import resolve_confusion_groups
25
# Mainstream Western Latin encodings that a demoted niche encoding may be
# swapped for.  They share the iso-8859-1 character repertoire at the byte
# values where a niche candidate is indistinguishable from it.  Promotion
# during niche Latin demotion is restricted to exactly these names, so an
# unrelated encoding (e.g. windows-1254) is never moved to the top by
# accident.
_COMMON_LATIN_ENCODINGS: frozenset[str] = frozenset(
    ("iso8859-1", "iso8859-15", "cp1252")
)
38
# Byte values that iso-8859-10 decodes to a different character than
# iso-8859-1 does.  The table is the literal result of:
#   {b for b in range(0x80, 0x100)
#    if bytes([b]).decode('iso-8859-10') != bytes([b]).decode('iso-8859-1')}
# Rows are grouped by high nibble to make gaps easy to spot.
# fmt: off
_ISO_8859_10_DISTINGUISHING: frozenset[int] = frozenset(
    {
        # 0xA0-0xAF
        0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAE, 0xAF,
        # 0xB0-0xBF
        0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
        0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
        # 0xC0-0xCF
        0xC0, 0xC7, 0xC8, 0xCA, 0xCC,
        # 0xD0-0xDF
        0xD1, 0xD2, 0xD7, 0xD9,
        # 0xE0-0xEF
        0xE0, 0xE7, 0xE8, 0xEA, 0xEC,
        # 0xF0-0xFF
        0xF1, 0xF2, 0xF7, 0xF9, 0xFF,
    }
)
# fmt: on
93
# Byte values that iso-8859-14 decodes to a different character than
# iso-8859-1 does.  The table is the literal result of:
#   {b for b in range(0x80, 0x100)
#    if bytes([b]).decode('iso-8859-14') != bytes([b]).decode('iso-8859-1')}
# Rows are grouped by high nibble to make gaps easy to spot.
# fmt: off
_ISO_8859_14_DISTINGUISHING: frozenset[int] = frozenset(
    {
        # 0xA0-0xAF
        0xA1, 0xA2, 0xA4, 0xA5, 0xA6, 0xA8, 0xAA, 0xAB, 0xAC, 0xAF,
        # 0xB0-0xBF
        0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5,
        0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
        # 0xD0-0xDF
        0xD0, 0xD7, 0xDE,
        # 0xF0-0xFF
        0xF0, 0xF7, 0xFE,
    }
)
# fmt: on
133
# Byte values at which windows-1254 carries a Turkish-specific letter in
# place of the windows-1252 character.  The two code pages differ at 8
# positions in total; 0x8E and 0x9E are merely undefined in windows-1254
# (while defined in windows-1252) and are left out, because an undefined
# byte is no help in recognising Turkish text.  The six positions kept
# here are the primary distinguishing signal.
_WINDOWS_1254_DISTINGUISHING: frozenset[int] = frozenset(
    (
        0xD0, 0xDD, 0xDE,  # upper-case positions
        0xF0, 0xFD, 0xFE,  # matching lower-case positions
    )
)
143
# Byte values that HP-Roman8 maps to a lowercase accented letter while
# ISO-8859-1 maps them to an uppercase letter.  Genuine HP-Roman8 text (from
# HP-UX terminals) contains these bytes; data merely misdetected as
# HP-Roman8 typically does not.  Selection rule:
#   {b for b in range(0x80, 0x100)
#    if (unicodedata.category(bytes([b]).decode('hp-roman8')) == 'Ll'
#        and unicodedata.category(bytes([b]).decode('iso-8859-1')) == 'Lu')}
# fmt: off
_HP_ROMAN8_DISTINGUISHING: frozenset[int] = frozenset(
    {
        # 0xC0-0xCF (the whole row)
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        # 0xD0-0xDF
        0xD1, 0xD4, 0xD5, 0xD6, 0xD9, 0xDD, 0xDE,
    }
)
# fmt: on
177
# Registry of encodings that frequently win as false positives when none of
# their distinguishing bytes occur in the data.  Each key is an encoding
# name; each value is the frozenset of byte values that separate that
# encoding from its common look-alike (iso-8859-1, or windows-1252 in the
# case of windows-1254).
_DEMOTION_CANDIDATES: dict[str, frozenset[int]] = {
    # ISO Latin variants
    "iso8859-10": _ISO_8859_10_DISTINGUISHING,
    "iso8859-14": _ISO_8859_14_DISTINGUISHING,
    # Windows Turkish code page
    "cp1254": _WINDOWS_1254_DISTINGUISHING,
    # HP-UX terminal character set
    "hp-roman8": _HP_ROMAN8_DISTINGUISHING,
}
188
# Byte values that KOI8-T assigns to Tajik-specific Cyrillic letters while
# KOI8-R assigns them to box-drawing characters.  Seeing any one of them is
# strong evidence that the text is KOI8-T rather than KOI8-R.
# fmt: off
_KOI8_T_DISTINGUISHING: frozenset[int] = frozenset(
    (
        0x80, 0x81, 0x83, 0x8A, 0x8C, 0x8D, 0x8E,  # 0x80-0x8F
        0x90,                                      # 0x90-0x9F
        0xA1, 0xA2, 0xA5,                          # 0xA0-0xAF
        0xB5,                                      # 0xB0-0xBF
    )
)
# fmt: on
195
196
def _should_demote(encoding: str, data: bytes) -> bool:
    """Report whether *encoding* has no byte-level support in *data*.

    *encoding* is looked up in ``_DEMOTION_CANDIDATES``.  An encoding that is
    not registered there is never demoted.  For a registered encoding,
    *data* is scanned for non-ASCII bytes that belong to the encoding's
    distinguishing set; if none occurs, nothing in the bytes favours the
    candidate over its common look-alike.

    :param encoding: Name of the encoding being considered for demotion.
    :param data: The raw bytes that were classified.
    :returns: ``True`` when *encoding* is a demotion candidate and *data*
        contains none of its distinguishing bytes; ``False`` otherwise.
    """
    marker_bytes = _DEMOTION_CANDIDATES.get(encoding)
    if marker_bytes is None:
        # Not a known false-positive encoding: leave the ranking alone.
        return False
    for byte in data:
        if byte > 0x7F and byte in marker_bytes:
            # A single distinguishing byte is enough to keep the encoding.
            return False
    return True
209
210
def _demote_niche_latin(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Swap an unsupported niche Latin winner for a common Latin candidate.

    Bigram models for some encodings (e.g. iso-8859-10, iso-8859-14,
    windows-1254) can come out on top for data made up solely of bytes
    shared with the common Western Latin encodings.  When the top result is
    such an encoding and ``_should_demote`` finds no distinguishing byte in
    *data*, the first lower-ranked entry whose encoding is in
    ``_COMMON_LATIN_ENCODINGS`` is moved to the front and given the former
    winner's confidence.  Every entry carrying the demoted encoding is moved
    to the end; the remaining entries keep their relative order in between.

    :param data: The raw bytes the results were produced from.
    :param results: Candidates ordered best-first.
    :returns: A new, re-ordered list when a swap happens; otherwise
        *results* itself, unchanged.
    """
    if len(results) <= 1:
        return results

    winner = results[0]
    losing_name = winner.encoding
    if losing_name is None or not _should_demote(losing_name, data):
        return results

    for candidate in results[1:]:
        if candidate.encoding not in _COMMON_LATIN_ENCODINGS:
            continue
        # The promoted entry inherits the confidence of the result it replaces.
        head = DetectionResult(
            candidate.encoding,
            winner.confidence,
            candidate.language,
            candidate.mime_type,
        )
        middle: list[DetectionResult] = []
        tail: list[DetectionResult] = []
        for entry in results:
            if entry.encoding == losing_name:
                tail.append(entry)
            elif entry is not candidate:
                middle.append(entry)
        return [head, *middle, *tail]

    # No common Latin candidate to promote: keep the original ranking.
    return results
241
242
def _promote_koi8t(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Move KOI8-T ahead of a winning KOI8-R when Tajik bytes are present.

    KOI8-T and KOI8-R share the whole 0xC0-0xFF Cyrillic letter block, which
    makes them hard to tell apart statistically.  KOI8-T, however, maps 12
    bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
    box-drawing characters.  If KOI8-R is ranked first, KOI8-T is among the
    candidates, and *data* contains at least one byte from
    ``_KOI8_T_DISTINGUISHING``, the KOI8-T entry is moved to the front and
    given the confidence of the former top result.

    :param data: The raw bytes the results were produced from.
    :param results: Candidates ordered best-first.
    :returns: A new list with KOI8-T first when promotion applies; otherwise
        *results* itself, unchanged.
    """
    if not results or results[0].encoding != "koi8-r":
        return results

    # Find the first KOI8-T entry, if the candidate list has one at all.
    tajik_pos = -1
    for pos, entry in enumerate(results):
        if entry.encoding == "koi8-t":
            tajik_pos = pos
            break
    if tajik_pos < 0:
        return results

    # Without a Tajik-specific byte there is no evidence against KOI8-R.
    if not any(byte > 0x7F and byte in _KOI8_T_DISTINGUISHING for byte in data):
        return results

    tajik = results[tajik_pos]
    boosted = DetectionResult(
        tajik.encoding,
        results[0].confidence,
        tajik.language,
        tajik.mime_type,
    )
    # Everything except the original KOI8-T entry follows in its old order.
    return [boosted, *results[:tajik_pos], *results[tajik_pos + 1 :]]
274
275
def postprocess_results(
    data: bytes,
    results: list[DetectionResult],
) -> list[DetectionResult]:
    """Run the three rank-correction passes over *results*.

    The passes are applied in a fixed order after statistical scoring, each
    one receiving the output of the previous one:

    1. ``resolve_confusion_groups`` (confusion-group resolution),
    2. ``_demote_niche_latin`` (niche Latin demotion),
    3. ``_promote_koi8t`` (KOI8-T promotion).

    Every pass is handed the same *data* to inspect and may re-order or
    replace entries in the ranking.

    :param data: The raw byte data the results were produced from.
    :param results: A list of :class:`DetectionResult` ranked by confidence.
    :returns: A new list (or the same list) with rank corrections applied.
    """
    after_confusion = resolve_confusion_groups(data, results)
    after_demotion = _demote_niche_latin(data, after_confusion)
    return _promote_koi8t(data, after_demotion)