1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, see
25# <https://www.gnu.org/licenses/>.
26######################### END LICENSE BLOCK #########################
27
28import re
29from typing import Union
30
31from .charsetgroupprober import CharSetGroupProber
32from .enums import EncodingEra, LanguageFilter, ProbingState
33from .hebrewprober import HebrewProber
34from .langarabicmodel import (
35 CP720_ARABIC_MODEL,
36 CP864_ARABIC_MODEL,
37 ISO_8859_6_ARABIC_MODEL,
38 WINDOWS_1256_ARABIC_MODEL,
39)
40from .langbelarusianmodel import (
41 CP866_BELARUSIAN_MODEL,
42 ISO_8859_5_BELARUSIAN_MODEL,
43 MACCYRILLIC_BELARUSIAN_MODEL,
44 WINDOWS_1251_BELARUSIAN_MODEL,
45)
46from .langbretonmodel import (
47 CP037_BRETON_MODEL,
48 CP500_BRETON_MODEL,
49 ISO_8859_14_BRETON_MODEL,
50)
51from .langbulgarianmodel import (
52 CP855_BULGARIAN_MODEL,
53 ISO_8859_5_BULGARIAN_MODEL,
54 MACCYRILLIC_BULGARIAN_MODEL,
55 WINDOWS_1251_BULGARIAN_MODEL,
56)
57from .langcroatianmodel import (
58 CP852_CROATIAN_MODEL,
59 ISO_8859_2_CROATIAN_MODEL,
60 ISO_8859_16_CROATIAN_MODEL,
61 MACLATIN2_CROATIAN_MODEL,
62 WINDOWS_1250_CROATIAN_MODEL,
63)
64from .langczechmodel import (
65 ISO_8859_2_CZECH_MODEL,
66 WINDOWS_1250_CZECH_MODEL,
67)
68from .langdanishmodel import (
69 CP037_DANISH_MODEL,
70 CP500_DANISH_MODEL,
71 CP850_DANISH_MODEL,
72 CP858_DANISH_MODEL,
73 CP865_DANISH_MODEL,
74 ISO_8859_1_DANISH_MODEL,
75 ISO_8859_15_DANISH_MODEL,
76 MACROMAN_DANISH_MODEL,
77 WINDOWS_1252_DANISH_MODEL,
78)
79from .langdutchmodel import (
80 CP037_DUTCH_MODEL,
81 CP500_DUTCH_MODEL,
82 CP850_DUTCH_MODEL,
83 CP858_DUTCH_MODEL,
84 ISO_8859_1_DUTCH_MODEL,
85 ISO_8859_15_DUTCH_MODEL,
86 MACROMAN_DUTCH_MODEL,
87 WINDOWS_1252_DUTCH_MODEL,
88)
89from .langenglishmodel import (
90 CP037_ENGLISH_MODEL,
91 CP437_ENGLISH_MODEL,
92 CP500_ENGLISH_MODEL,
93 CP850_ENGLISH_MODEL,
94 CP858_ENGLISH_MODEL,
95 ISO_8859_1_ENGLISH_MODEL,
96 ISO_8859_15_ENGLISH_MODEL,
97 MACROMAN_ENGLISH_MODEL,
98 WINDOWS_1252_ENGLISH_MODEL,
99)
100from .langesperantomodel import ISO_8859_3_ESPERANTO_MODEL
101from .langestonianmodel import (
102 CP775_ESTONIAN_MODEL,
103 ISO_8859_4_ESTONIAN_MODEL,
104 ISO_8859_13_ESTONIAN_MODEL,
105 WINDOWS_1257_ESTONIAN_MODEL,
106)
107from .langfarsimodel import (
108 ISO_8859_6_FARSI_MODEL,
109 WINDOWS_1256_FARSI_MODEL,
110)
111from .langfinnishmodel import (
112 CP037_FINNISH_MODEL,
113 CP500_FINNISH_MODEL,
114 CP850_FINNISH_MODEL,
115 CP858_FINNISH_MODEL,
116 ISO_8859_1_FINNISH_MODEL,
117 ISO_8859_15_FINNISH_MODEL,
118 MACROMAN_FINNISH_MODEL,
119 WINDOWS_1252_FINNISH_MODEL,
120)
121from .langfrenchmodel import (
122 CP037_FRENCH_MODEL,
123 CP500_FRENCH_MODEL,
124 CP850_FRENCH_MODEL,
125 CP858_FRENCH_MODEL,
126 CP863_FRENCH_MODEL,
127 ISO_8859_1_FRENCH_MODEL,
128 ISO_8859_15_FRENCH_MODEL,
129 MACROMAN_FRENCH_MODEL,
130 WINDOWS_1252_FRENCH_MODEL,
131)
132from .langgermanmodel import (
133 CP037_GERMAN_MODEL,
134 CP500_GERMAN_MODEL,
135 CP850_GERMAN_MODEL,
136 CP858_GERMAN_MODEL,
137 ISO_8859_1_GERMAN_MODEL,
138 ISO_8859_15_GERMAN_MODEL,
139 MACROMAN_GERMAN_MODEL,
140 WINDOWS_1252_GERMAN_MODEL,
141)
142from .langgreekmodel import (
143 CP737_GREEK_MODEL,
144 CP869_GREEK_MODEL,
145 CP875_GREEK_MODEL,
146 ISO_8859_7_GREEK_MODEL,
147 MACGREEK_GREEK_MODEL,
148 WINDOWS_1253_GREEK_MODEL,
149)
150from .langhebrewmodel import (
151 CP424_HEBREW_MODEL,
152 CP856_HEBREW_MODEL,
153 CP862_HEBREW_MODEL,
154 ISO_8859_8_HEBREW_MODEL,
155 WINDOWS_1255_HEBREW_MODEL,
156)
157from .langhungarianmodel import (
158 CP852_HUNGARIAN_MODEL,
159 ISO_8859_2_HUNGARIAN_MODEL,
160 ISO_8859_16_HUNGARIAN_MODEL,
161 MACLATIN2_HUNGARIAN_MODEL,
162 WINDOWS_1250_HUNGARIAN_MODEL,
163)
164from .langicelandicmodel import (
165 CP037_ICELANDIC_MODEL,
166 CP500_ICELANDIC_MODEL,
167 CP861_ICELANDIC_MODEL,
168 ISO_8859_1_ICELANDIC_MODEL,
169 ISO_8859_10_ICELANDIC_MODEL,
170 MACICELAND_ICELANDIC_MODEL,
171)
172from .langindonesianmodel import (
173 CP037_INDONESIAN_MODEL,
174 CP500_INDONESIAN_MODEL,
175 ISO_8859_1_INDONESIAN_MODEL,
176 MACROMAN_INDONESIAN_MODEL,
177 WINDOWS_1252_INDONESIAN_MODEL,
178)
179from .langirishmodel import (
180 CP037_IRISH_MODEL,
181 CP500_IRISH_MODEL,
182 ISO_8859_14_IRISH_MODEL,
183)
184from .langitalianmodel import (
185 CP037_ITALIAN_MODEL,
186 CP500_ITALIAN_MODEL,
187 CP850_ITALIAN_MODEL,
188 CP858_ITALIAN_MODEL,
189 ISO_8859_1_ITALIAN_MODEL,
190 ISO_8859_15_ITALIAN_MODEL,
191 MACROMAN_ITALIAN_MODEL,
192 WINDOWS_1252_ITALIAN_MODEL,
193)
194from .langkazakhmodel import (
195 KZ1048_KAZAKH_MODEL,
196 PTCP154_KAZAKH_MODEL,
197)
198from .langlatvianmodel import (
199 CP775_LATVIAN_MODEL,
200 ISO_8859_4_LATVIAN_MODEL,
201 ISO_8859_13_LATVIAN_MODEL,
202 WINDOWS_1257_LATVIAN_MODEL,
203)
204from .langlithuanianmodel import (
205 CP775_LITHUANIAN_MODEL,
206 ISO_8859_4_LITHUANIAN_MODEL,
207 ISO_8859_13_LITHUANIAN_MODEL,
208 WINDOWS_1257_LITHUANIAN_MODEL,
209)
210from .langmacedonianmodel import (
211 CP855_MACEDONIAN_MODEL,
212 ISO_8859_5_MACEDONIAN_MODEL,
213 MACCYRILLIC_MACEDONIAN_MODEL,
214 WINDOWS_1251_MACEDONIAN_MODEL,
215)
216from .langmalaymodel import (
217 CP037_MALAY_MODEL,
218 CP500_MALAY_MODEL,
219 ISO_8859_1_MALAY_MODEL,
220 MACROMAN_MALAY_MODEL,
221 WINDOWS_1252_MALAY_MODEL,
222)
223from .langmaltesemodel import ISO_8859_3_MALTESE_MODEL
224from .langnorwegianmodel import (
225 CP037_NORWEGIAN_MODEL,
226 CP500_NORWEGIAN_MODEL,
227 CP850_NORWEGIAN_MODEL,
228 CP858_NORWEGIAN_MODEL,
229 CP865_NORWEGIAN_MODEL,
230 ISO_8859_1_NORWEGIAN_MODEL,
231 ISO_8859_15_NORWEGIAN_MODEL,
232 MACROMAN_NORWEGIAN_MODEL,
233 WINDOWS_1252_NORWEGIAN_MODEL,
234)
235from .langpolishmodel import (
236 CP852_POLISH_MODEL,
237 ISO_8859_2_POLISH_MODEL,
238 ISO_8859_16_POLISH_MODEL,
239 MACLATIN2_POLISH_MODEL,
240 WINDOWS_1250_POLISH_MODEL,
241)
242from .langportuguesemodel import (
243 CP037_PORTUGUESE_MODEL,
244 CP500_PORTUGUESE_MODEL,
245 CP850_PORTUGUESE_MODEL,
246 CP858_PORTUGUESE_MODEL,
247 CP860_PORTUGUESE_MODEL,
248 ISO_8859_1_PORTUGUESE_MODEL,
249 ISO_8859_15_PORTUGUESE_MODEL,
250 MACROMAN_PORTUGUESE_MODEL,
251 WINDOWS_1252_PORTUGUESE_MODEL,
252)
253from .langromanianmodel import (
254 CP852_ROMANIAN_MODEL,
255 ISO_8859_2_ROMANIAN_MODEL,
256 ISO_8859_16_ROMANIAN_MODEL,
257 MACLATIN2_ROMANIAN_MODEL,
258 WINDOWS_1250_ROMANIAN_MODEL,
259)
260from .langrussianmodel import (
261 CP855_RUSSIAN_MODEL,
262 CP866_RUSSIAN_MODEL,
263 ISO_8859_5_RUSSIAN_MODEL,
264 KOI8_R_RUSSIAN_MODEL,
265 MACCYRILLIC_RUSSIAN_MODEL,
266 WINDOWS_1251_RUSSIAN_MODEL,
267)
268from .langscottishgaelicmodel import (
269 CP037_SCOTTISH_GAELIC_MODEL,
270 CP500_SCOTTISH_GAELIC_MODEL,
271 ISO_8859_14_SCOTTISH_GAELIC_MODEL,
272)
273from .langserbianmodel import (
274 CP855_SERBIAN_MODEL,
275 ISO_8859_5_SERBIAN_MODEL,
276 MACCYRILLIC_SERBIAN_MODEL,
277 WINDOWS_1251_SERBIAN_MODEL,
278)
279from .langslovakmodel import (
280 CP852_SLOVAK_MODEL,
281 ISO_8859_2_SLOVAK_MODEL,
282 ISO_8859_16_SLOVAK_MODEL,
283 MACLATIN2_SLOVAK_MODEL,
284 WINDOWS_1250_SLOVAK_MODEL,
285)
286from .langslovenemodel import (
287 CP852_SLOVENE_MODEL,
288 ISO_8859_2_SLOVENE_MODEL,
289 ISO_8859_16_SLOVENE_MODEL,
290 MACLATIN2_SLOVENE_MODEL,
291 WINDOWS_1250_SLOVENE_MODEL,
292)
293from .langspanishmodel import (
294 CP037_SPANISH_MODEL,
295 CP500_SPANISH_MODEL,
296 CP850_SPANISH_MODEL,
297 CP858_SPANISH_MODEL,
298 ISO_8859_1_SPANISH_MODEL,
299 ISO_8859_15_SPANISH_MODEL,
300 MACROMAN_SPANISH_MODEL,
301 WINDOWS_1252_SPANISH_MODEL,
302)
303from .langswedishmodel import (
304 CP037_SWEDISH_MODEL,
305 CP500_SWEDISH_MODEL,
306 CP850_SWEDISH_MODEL,
307 CP858_SWEDISH_MODEL,
308 ISO_8859_1_SWEDISH_MODEL,
309 ISO_8859_15_SWEDISH_MODEL,
310 MACROMAN_SWEDISH_MODEL,
311 WINDOWS_1252_SWEDISH_MODEL,
312)
313from .langtajikmodel import KOI8_T_TAJIK_MODEL
314from .langthaimodel import (
315 CP874_THAI_MODEL,
316 ISO_8859_11_THAI_MODEL,
317 TIS_620_THAI_MODEL,
318)
319from .langturkishmodel import (
320 CP857_TURKISH_MODEL,
321 CP1026_TURKISH_MODEL,
322 ISO_8859_3_TURKISH_MODEL,
323 ISO_8859_9_TURKISH_MODEL,
324 MACTURKISH_TURKISH_MODEL,
325 WINDOWS_1254_TURKISH_MODEL,
326)
327from .langukrainianmodel import (
328 CP1125_UKRAINIAN_MODEL,
329 ISO_8859_5_UKRAINIAN_MODEL,
330 KOI8_U_UKRAINIAN_MODEL,
331 MACCYRILLIC_UKRAINIAN_MODEL,
332 WINDOWS_1251_UKRAINIAN_MODEL,
333)
334from .langvietnamesemodel import WINDOWS_1258_VIETNAMESE_MODEL
335from .langwelshmodel import (
336 CP037_WELSH_MODEL,
337 CP500_WELSH_MODEL,
338 ISO_8859_14_WELSH_MODEL,
339)
340from .sbcharsetprober import SingleByteCharSetProber
341
# ---------------------------------------------------------------------------
# Byte-pattern detectors used to tell single-byte encoding families apart.
#
# The 0x80-0x9F range means something different in each family:
#   * Windows code pages: printable punctuation (smart quotes, dashes,
#     currency symbols)
#   * Mac encodings:      printable letters (accented letters, diacriticals)
#   * ISO-8859-x:         C1 control codes, mostly unprintable
# ---------------------------------------------------------------------------

# Matches any single byte in the contested 0x80-0x9F range.
WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")

# Matches bytes that are letters in the Mac Latin encodings but control codes
# or punctuation in the Windows/ISO ones.
MAC_LATIN_ONLY_LETTER_DETECTOR = re.compile(b"[\x81\x8d\x8f\x90\x92\x9d]")

# Matches bytes that are Cyrillic letters in MacCyrillic but punctuation in
# Windows-1251.
MAC_CYRILLIC_ONLY_LETTER_DETECTOR = re.compile(
    b"[\x82\x84\x85\x86\x87\x88\x89\x8b\x91\x92\x93\x94\x95\x96\x97\x99\x9b]"
)

# Matches a 0x80-0x9F byte sandwiched between two ASCII letters; a byte used
# mid-word hints that it is a letter, i.e. a Mac encoding.
MAC_LETTER_IN_WORD_DETECTOR = re.compile(b"[a-zA-Z][\x80-\x9f][a-zA-Z]")

# Matches byte 0xA4: the Euro sign in ISO-8859-15, the generic currency
# sign in ISO-8859-1.
EURO_SIGN_DETECTOR = re.compile(b"\xa4")

# Charset names (lowercase) of the "Latin" group in which the Mac member maps
# 0x80-0x9F to letters while the Windows member maps it to punctuation.
CONFUSED_LATIN_ENCODINGS = frozenset(
    (
        # MacRoman and its Windows/ISO counterparts
        "macroman",
        "windows-1252",
        "iso-8859-1",
        "iso-8859-15",
        # Greek
        "macgreek",
        "windows-1253",
        "iso-8859-7",
        # Turkish
        "macturkish",
        "windows-1254",
        "iso-8859-9",
        # Turkish/Maltese/Esperanto (also works with MacTurkish/Win-1254)
        "iso-8859-3",
        # MacIceland and the remaining ISO members of the group
        "maciceland",
        "iso-8859-10",
        "iso-8859-14",
    )
)

# Same idea for the Central European group.
CONFUSED_CENTRAL_EUROPEAN_ENCODINGS = frozenset(
    (
        "maclatin2",
        "windows-1250",
        "iso-8859-2",
        # Southeast European/Romanian (close to Latin-2)
        "iso-8859-16",
    )
)

# Same idea for the Cyrillic group.
CONFUSED_CYRILLIC_ENCODINGS = frozenset(
    (
        "maccyrillic",
        "windows-1251",
        "iso-8859-5",
    )
)

# Lowercase ISO charset name -> display name of its Windows equivalent.
ISO_WIN_MAP = {
    "iso-8859-1": "Windows-1252",
    "iso-8859-2": "Windows-1250",
    "iso-8859-5": "Windows-1251",
    "iso-8859-6": "Windows-1256",
    "iso-8859-7": "Windows-1253",
    "iso-8859-8": "Windows-1255",
    "iso-8859-9": "Windows-1254",
    "iso-8859-13": "Windows-1257",
}
409
410
class SBCSGroupProber(CharSetGroupProber):
    """Group prober that runs the single-byte charset model probers.

    On top of the confidence ranking provided by ``CharSetGroupProber`` this
    class records a few byte patterns seen in the input (bytes in 0x80-0x9F,
    Mac-only letter bytes, the 0xA4 Euro/currency byte) and uses them in
    ``get_confidence`` to choose between Mac, Windows and ISO-8859 encodings
    whose model confidences are nearly identical.
    """

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.MODERN_WEB,
    ) -> None:
        """Create the child probers and reset all state.

        Args:
            lang_filter: Forwarded to ``CharSetGroupProber.__init__``.
            encoding_era: Forwarded to ``CharSetGroupProber.__init__``.

        The full prober list is built first and then narrowed by
        ``self._filter_probers`` (presumably using the two arguments above;
        that helper is defined outside this class).
        """
        super().__init__(lang_filter=lang_filter, encoding_era=encoding_era)

        # Initialize byte pattern tracking for disambiguation heuristics
        self._has_win_bytes = False
        self._has_mac_latin_letter_pattern = False
        self._has_mac_cyrillic_letter_pattern = False
        self._has_euro_sign = False
        self._input_bytes = bytearray()

        # The Hebrew name prober arbitrates between a logical (Windows-1255)
        # and a visual (reversed ISO-8859-8) model prober; all three are
        # registered in the list below.
        hebrew_prober = HebrewProber()
        logical_hebrew_prober = SingleByteCharSetProber(
            WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
        )
        visual_hebrew_prober = SingleByteCharSetProber(
            ISO_8859_8_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
        )
        hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)

        # TODO: ORDER MATTERS HERE. I changed the order vs what was in master
        #       and several tests failed that did not before. Some thought
        #       should be put into the ordering, and we should consider making
        #       order not matter here, because that is very counter-intuitive.
        self.probers = [
            SingleByteCharSetProber(CP720_ARABIC_MODEL),
            SingleByteCharSetProber(CP864_ARABIC_MODEL),
            SingleByteCharSetProber(ISO_8859_6_ARABIC_MODEL),
            SingleByteCharSetProber(WINDOWS_1256_ARABIC_MODEL),
            SingleByteCharSetProber(CP866_BELARUSIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_BELARUSIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_BELARUSIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_BELARUSIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_BRETON_MODEL),
            SingleByteCharSetProber(CP037_BRETON_MODEL),
            SingleByteCharSetProber(CP500_BRETON_MODEL),
            SingleByteCharSetProber(CP855_BULGARIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_BULGARIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
            SingleByteCharSetProber(CP852_CROATIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_16_CROATIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_CROATIAN_MODEL),
            SingleByteCharSetProber(MACLATIN2_CROATIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_CROATIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_CZECH_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_CZECH_MODEL),
            SingleByteCharSetProber(CP037_DANISH_MODEL),
            SingleByteCharSetProber(CP500_DANISH_MODEL),
            SingleByteCharSetProber(CP850_DANISH_MODEL),
            SingleByteCharSetProber(CP858_DANISH_MODEL),
            SingleByteCharSetProber(CP865_DANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_DANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_DANISH_MODEL),
            SingleByteCharSetProber(MACROMAN_DANISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_DANISH_MODEL),
            SingleByteCharSetProber(CP037_DUTCH_MODEL),
            SingleByteCharSetProber(CP500_DUTCH_MODEL),
            SingleByteCharSetProber(CP850_DUTCH_MODEL),
            SingleByteCharSetProber(CP858_DUTCH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_DUTCH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_DUTCH_MODEL),
            SingleByteCharSetProber(MACROMAN_DUTCH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_DUTCH_MODEL),
            SingleByteCharSetProber(CP037_ENGLISH_MODEL),
            SingleByteCharSetProber(CP437_ENGLISH_MODEL),
            SingleByteCharSetProber(CP500_ENGLISH_MODEL),
            SingleByteCharSetProber(CP850_ENGLISH_MODEL),
            SingleByteCharSetProber(CP858_ENGLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_ENGLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_ENGLISH_MODEL),
            SingleByteCharSetProber(MACROMAN_ENGLISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_ENGLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_3_ESPERANTO_MODEL),
            SingleByteCharSetProber(CP775_ESTONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_13_ESTONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_4_ESTONIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1257_ESTONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_6_FARSI_MODEL),
            SingleByteCharSetProber(WINDOWS_1256_FARSI_MODEL),
            SingleByteCharSetProber(CP037_FINNISH_MODEL),
            SingleByteCharSetProber(CP500_FINNISH_MODEL),
            SingleByteCharSetProber(CP850_FINNISH_MODEL),
            SingleByteCharSetProber(CP858_FINNISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_FINNISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_FINNISH_MODEL),
            SingleByteCharSetProber(MACROMAN_FINNISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_FINNISH_MODEL),
            SingleByteCharSetProber(CP037_FRENCH_MODEL),
            SingleByteCharSetProber(CP500_FRENCH_MODEL),
            SingleByteCharSetProber(CP850_FRENCH_MODEL),
            SingleByteCharSetProber(CP858_FRENCH_MODEL),
            SingleByteCharSetProber(CP863_FRENCH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_FRENCH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_FRENCH_MODEL),
            SingleByteCharSetProber(MACROMAN_FRENCH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_FRENCH_MODEL),
            SingleByteCharSetProber(CP037_GERMAN_MODEL),
            SingleByteCharSetProber(CP500_GERMAN_MODEL),
            SingleByteCharSetProber(CP850_GERMAN_MODEL),
            SingleByteCharSetProber(CP858_GERMAN_MODEL),
            SingleByteCharSetProber(ISO_8859_15_GERMAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_GERMAN_MODEL),
            SingleByteCharSetProber(MACROMAN_GERMAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_GERMAN_MODEL),
            SingleByteCharSetProber(CP737_GREEK_MODEL),
            SingleByteCharSetProber(CP869_GREEK_MODEL),
            SingleByteCharSetProber(CP875_GREEK_MODEL),
            SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
            SingleByteCharSetProber(MACGREEK_GREEK_MODEL),
            SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
            SingleByteCharSetProber(CP424_HEBREW_MODEL, is_reversed=True),
            SingleByteCharSetProber(CP856_HEBREW_MODEL, is_reversed=True),
            SingleByteCharSetProber(CP862_HEBREW_MODEL, is_reversed=True),
            hebrew_prober,
            logical_hebrew_prober,
            visual_hebrew_prober,
            SingleByteCharSetProber(CP852_HUNGARIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_16_HUNGARIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
            SingleByteCharSetProber(MACLATIN2_HUNGARIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
            SingleByteCharSetProber(CP037_ICELANDIC_MODEL),
            SingleByteCharSetProber(CP500_ICELANDIC_MODEL),
            SingleByteCharSetProber(CP861_ICELANDIC_MODEL),
            SingleByteCharSetProber(ISO_8859_10_ICELANDIC_MODEL),
            SingleByteCharSetProber(ISO_8859_1_ICELANDIC_MODEL),
            SingleByteCharSetProber(MACICELAND_ICELANDIC_MODEL),
            SingleByteCharSetProber(CP037_INDONESIAN_MODEL),
            SingleByteCharSetProber(CP500_INDONESIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_INDONESIAN_MODEL),
            SingleByteCharSetProber(MACROMAN_INDONESIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_INDONESIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_IRISH_MODEL),
            SingleByteCharSetProber(CP037_IRISH_MODEL),
            SingleByteCharSetProber(CP500_IRISH_MODEL),
            SingleByteCharSetProber(CP037_ITALIAN_MODEL),
            SingleByteCharSetProber(CP500_ITALIAN_MODEL),
            SingleByteCharSetProber(CP850_ITALIAN_MODEL),
            SingleByteCharSetProber(CP858_ITALIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_15_ITALIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_ITALIAN_MODEL),
            SingleByteCharSetProber(MACROMAN_ITALIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_ITALIAN_MODEL),
            SingleByteCharSetProber(KZ1048_KAZAKH_MODEL),
            SingleByteCharSetProber(PTCP154_KAZAKH_MODEL),
            SingleByteCharSetProber(CP775_LATVIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_13_LATVIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_4_LATVIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1257_LATVIAN_MODEL),
            SingleByteCharSetProber(CP775_LITHUANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_13_LITHUANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_4_LITHUANIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1257_LITHUANIAN_MODEL),
            SingleByteCharSetProber(CP855_MACEDONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_MACEDONIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_MACEDONIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_MACEDONIAN_MODEL),
            SingleByteCharSetProber(CP037_MALAY_MODEL),
            SingleByteCharSetProber(CP500_MALAY_MODEL),
            SingleByteCharSetProber(ISO_8859_1_MALAY_MODEL),
            SingleByteCharSetProber(MACROMAN_MALAY_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_MALAY_MODEL),
            SingleByteCharSetProber(ISO_8859_3_MALTESE_MODEL),
            SingleByteCharSetProber(CP037_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP500_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP850_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP858_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP865_NORWEGIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_15_NORWEGIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_NORWEGIAN_MODEL),
            SingleByteCharSetProber(MACROMAN_NORWEGIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP852_POLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_16_POLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_2_POLISH_MODEL),
            SingleByteCharSetProber(MACLATIN2_POLISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_POLISH_MODEL),
            SingleByteCharSetProber(CP037_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP500_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP850_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP858_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP860_PORTUGUESE_MODEL),
            SingleByteCharSetProber(ISO_8859_15_PORTUGUESE_MODEL),
            SingleByteCharSetProber(ISO_8859_1_PORTUGUESE_MODEL),
            SingleByteCharSetProber(MACROMAN_PORTUGUESE_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP852_ROMANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_16_ROMANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_ROMANIAN_MODEL),
            SingleByteCharSetProber(MACLATIN2_ROMANIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_ROMANIAN_MODEL),
            SingleByteCharSetProber(CP855_RUSSIAN_MODEL),
            SingleByteCharSetProber(CP866_RUSSIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
            SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
            SingleByteCharSetProber(CP855_SERBIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_SERBIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_SERBIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_SERBIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_SCOTTISH_GAELIC_MODEL),
            SingleByteCharSetProber(CP037_SCOTTISH_GAELIC_MODEL),
            SingleByteCharSetProber(CP500_SCOTTISH_GAELIC_MODEL),
            SingleByteCharSetProber(CP852_SLOVAK_MODEL),
            SingleByteCharSetProber(ISO_8859_16_SLOVAK_MODEL),
            SingleByteCharSetProber(ISO_8859_2_SLOVAK_MODEL),
            SingleByteCharSetProber(MACLATIN2_SLOVAK_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_SLOVAK_MODEL),
            SingleByteCharSetProber(CP852_SLOVENE_MODEL),
            SingleByteCharSetProber(ISO_8859_16_SLOVENE_MODEL),
            SingleByteCharSetProber(ISO_8859_2_SLOVENE_MODEL),
            SingleByteCharSetProber(MACLATIN2_SLOVENE_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_SLOVENE_MODEL),
            SingleByteCharSetProber(CP037_SPANISH_MODEL),
            SingleByteCharSetProber(CP500_SPANISH_MODEL),
            SingleByteCharSetProber(CP850_SPANISH_MODEL),
            SingleByteCharSetProber(CP858_SPANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_SPANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_SPANISH_MODEL),
            SingleByteCharSetProber(MACROMAN_SPANISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_SPANISH_MODEL),
            SingleByteCharSetProber(CP037_SWEDISH_MODEL),
            SingleByteCharSetProber(CP500_SWEDISH_MODEL),
            SingleByteCharSetProber(CP850_SWEDISH_MODEL),
            SingleByteCharSetProber(CP858_SWEDISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_SWEDISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_SWEDISH_MODEL),
            SingleByteCharSetProber(MACROMAN_SWEDISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_SWEDISH_MODEL),
            SingleByteCharSetProber(KOI8_T_TAJIK_MODEL),
            SingleByteCharSetProber(CP874_THAI_MODEL),
            SingleByteCharSetProber(ISO_8859_11_THAI_MODEL),
            SingleByteCharSetProber(TIS_620_THAI_MODEL),
            SingleByteCharSetProber(CP1026_TURKISH_MODEL),
            SingleByteCharSetProber(CP857_TURKISH_MODEL),
            SingleByteCharSetProber(ISO_8859_3_TURKISH_MODEL),
            SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
            SingleByteCharSetProber(MACTURKISH_TURKISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1254_TURKISH_MODEL),
            SingleByteCharSetProber(CP1125_UKRAINIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_UKRAINIAN_MODEL),
            SingleByteCharSetProber(KOI8_U_UKRAINIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_UKRAINIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_UKRAINIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_WELSH_MODEL),
            SingleByteCharSetProber(CP037_WELSH_MODEL),
            SingleByteCharSetProber(CP500_WELSH_MODEL),
            SingleByteCharSetProber(WINDOWS_1258_VIETNAMESE_MODEL),
        ]

        # Filter probers based on encoding era and language
        self.probers = self._filter_probers(self.probers)
        self.reset()

    def reset(self) -> None:
        """Reset the parent state and clear every recorded byte pattern."""
        super().reset()
        self._has_win_bytes = False
        self._has_mac_latin_letter_pattern = False
        self._has_mac_cyrillic_letter_pattern = False
        self._has_euro_sign = False
        self._input_bytes = bytearray()

    def feed(self, byte_str: Union[bytes, bytearray]) -> "ProbingState":
        """Record byte patterns in ``byte_str`` and pass it to the parent.

        Args:
            byte_str: The next chunk of input.

        Returns:
            Whatever ``CharSetGroupProber.feed`` returns for this chunk.
        """
        # The in-word Mac pattern is three bytes long, so it can straddle two
        # consecutive chunks.  Build a small window from the last two bytes
        # already buffered plus the first two new bytes; any match inside this
        # (at most four byte) window necessarily spans the chunk boundary.
        boundary = bytes(self._input_bytes[-2:]) + bytes(byte_str[:2])

        # NOTE(review): the whole input is retained here although this class
        # only ever consults its last two bytes.
        self._input_bytes.extend(byte_str)

        # Flags are sticky until reset(), so a flag that is already set does
        # not need another scan.  Apart from the boundary window only the new
        # bytes are searched.
        if not self._has_win_bytes and WIN_BYTE_DETECTOR.search(byte_str):
            self._has_win_bytes = True
        if not self._has_mac_latin_letter_pattern and (
            MAC_LETTER_IN_WORD_DETECTOR.search(byte_str)
            or MAC_LATIN_ONLY_LETTER_DETECTOR.search(byte_str)
            or MAC_LETTER_IN_WORD_DETECTOR.search(boundary)
        ):
            self._has_mac_latin_letter_pattern = True
        if (
            not self._has_mac_cyrillic_letter_pattern
            and MAC_CYRILLIC_ONLY_LETTER_DETECTOR.search(byte_str)
        ):
            self._has_mac_cyrillic_letter_pattern = True
        if not self._has_euro_sign and EURO_SIGN_DETECTOR.search(byte_str):
            self._has_euro_sign = True

        # Call parent feed method
        return super().feed(byte_str)

    def get_confidence(self) -> float:
        """Return the group confidence after Mac/Windows/ISO disambiguation.

        The parent picks the best prober first.  If one of the heuristics
        below prefers a different, nearly-as-confident prober for a related
        charset, ``self._best_guess_prober`` is replaced by that prober and
        its confidence is returned instead.

        Returns:
            The confidence of the (possibly replaced) best-guess prober, or
            the parent's confidence when no heuristic applies.
        """
        # Get base confidence from parent
        base_confidence = super().get_confidence()

        # If no best prober yet, return base confidence
        if not self._best_guess_prober:
            return base_confidence

        # Apply heuristics to disambiguate confused encodings
        charset_name = self._best_guess_prober.charset_name
        if not charset_name:
            return base_confidence

        confidence = base_confidence
        lower_charset_name = charset_name.lower()

        # Build alternatives dict: most confident active prober for every
        # charset other than the winner's.  Probers that report the winner's
        # own charset (same encoding, different language model) are skipped:
        # the winner already has the highest confidence, so "switching" to
        # one of them could only lower the confidence for the same charset.
        alternatives = {}
        for prober in self.probers:
            if not prober.active or prober is self._best_guess_prober:
                continue
            alt_name = (prober.charset_name or "").lower()
            if alt_name == lower_charset_name:
                continue
            alt_conf = prober.get_confidence()
            if alt_name not in alternatives or alt_conf > alternatives[alt_name][1]:
                alternatives[alt_name] = (prober, alt_conf)

        # Heuristic 1: Mac/Windows/ISO disambiguation for LATIN encodings
        is_latin_family = lower_charset_name in CONFUSED_LATIN_ENCODINGS

        if is_latin_family and lower_charset_name == "macroman":
            # MacRoman wins but no Mac patterns -> prefer ISO/Windows
            # If we have Win bytes, prefer Windows encodings specifically
            if not self._has_mac_latin_letter_pattern:
                alt_names = (
                    ("windows-1252", "iso-8859-1", "iso-8859-15")
                    if self._has_win_bytes
                    else ("iso-8859-1", "windows-1252", "iso-8859-15")
                )
                for alt_name in alt_names:
                    if alt_name in alternatives:
                        prober, alt_conf = alternatives[alt_name]
                        if alt_conf >= confidence * 0.995:  # Within 0.5%
                            self._best_guess_prober = prober
                            return alt_conf

        # Cross-family Mac vs Windows disambiguation
        # If ANY Mac encoding wins but we have Windows bytes and no Mac patterns,
        # prefer any close Windows alternative (even from different language family)
        # This handles cases where MacRoman/MacLatin2/etc wins against text in a
        # different family
        if (
            lower_charset_name.startswith("mac")
            and self._has_win_bytes
            and not self._has_mac_latin_letter_pattern
        ):
            # Look for Windows alternatives
            win_alternatives = [
                (prober, conf)
                for name, (prober, conf) in alternatives.items()
                if name.startswith("windows-")
            ]
            if win_alternatives:
                # Take the most confident Windows alternative; on ties max()
                # keeps the first one, like the stable sort it replaces.
                best_win_prober, best_win_conf = max(
                    win_alternatives, key=lambda item: item[1]
                )
                if best_win_conf >= confidence * 0.995:  # Within 0.5%
                    self._best_guess_prober = best_win_prober
                    return best_win_conf

        elif lower_charset_name.startswith("iso-8859"):
            is_latin_iso = lower_charset_name in CONFUSED_LATIN_ENCODINGS

            # ISO wins and has Windows bytes -> switch to Windows
            if self._has_win_bytes:
                should_switch = True
                # But check if Mac is close with Mac patterns (Latin only)
                if is_latin_iso and self._has_mac_latin_letter_pattern:
                    for mac_name in alternatives:
                        if (
                            mac_name.startswith("mac")
                            and mac_name in CONFUSED_LATIN_ENCODINGS
                        ):
                            _, mac_conf = alternatives[mac_name]
                            if mac_conf >= confidence * 0.995:
                                should_switch = False
                                break

                if should_switch:
                    win_name = ISO_WIN_MAP.get(lower_charset_name)
                    if win_name and win_name.lower() in alternatives:
                        prober, alt_conf = alternatives[win_name.lower()]
                        self._best_guess_prober = prober
                        return alt_conf

            # ISO-8859-1 with Euro sign -> prefer ISO-8859-15
            if lower_charset_name == "iso-8859-1" and self._has_euro_sign:
                if "iso-8859-15" in alternatives:
                    prober, alt_conf = alternatives["iso-8859-15"]
                    if alt_conf >= confidence * 0.99:
                        self._best_guess_prober = prober
                        return alt_conf

        # Heuristic 2: Euro sign detection for Latin encodings
        if self._has_euro_sign and "iso-8859-15" in alternatives:
            is_latin_encoding = lower_charset_name in CONFUSED_LATIN_ENCODINGS
            if is_latin_encoding:
                prober, alt_conf = alternatives["iso-8859-15"]
                if alt_conf >= confidence * 0.99:
                    self._best_guess_prober = prober
                    return alt_conf

        # Heuristic 3: Prefer Mac over Windows/ISO when Mac Latin letter
        # patterns present.  Whether the winner is a non-Mac Latin encoding
        # does not depend on the candidate, so it is checked once up front.
        is_latin_win_or_iso = (
            lower_charset_name in CONFUSED_LATIN_ENCODINGS
            and not lower_charset_name.startswith("mac")
        )
        if self._has_mac_latin_letter_pattern and is_latin_win_or_iso:
            for mac_name in alternatives:
                if not (
                    mac_name.startswith("mac") and mac_name in CONFUSED_LATIN_ENCODINGS
                ):
                    continue
                prober, mac_conf = alternatives[mac_name]
                if mac_conf >= confidence * 0.90:
                    self._best_guess_prober = prober
                    return mac_conf

        # Heuristic 4: Mac/Windows/ISO disambiguation for CYRILLIC encodings
        is_cyrillic_family = lower_charset_name in CONFUSED_CYRILLIC_ENCODINGS

        if is_cyrillic_family and lower_charset_name == "maccyrillic":
            # MacCyrillic wins but no Mac Cyrillic patterns -> prefer Windows/ISO
            if not self._has_mac_cyrillic_letter_pattern and not self._has_win_bytes:
                for alt_name in ("windows-1251", "iso-8859-5"):
                    if alt_name in alternatives:
                        prober, alt_conf = alternatives[alt_name]
                        if alt_conf >= confidence * 0.995:
                            self._best_guess_prober = prober
                            return alt_conf

        elif is_cyrillic_family and lower_charset_name == "iso-8859-5":
            # ISO-8859-5 wins and has Windows bytes -> switch to Windows-1251
            # NOTE(review): this branch looks unreachable in effect.  With
            # Windows bytes present the generic "iso-8859" branch above has
            # already returned whenever "windows-1251" is in alternatives, so
            # the MacCyrillic guard below never gets to veto the switch.
            if self._has_win_bytes:
                should_switch = True
                if (
                    self._has_mac_cyrillic_letter_pattern
                    and "maccyrillic" in alternatives
                ):
                    _, mac_conf = alternatives["maccyrillic"]
                    if mac_conf >= confidence * 0.995:
                        should_switch = False

                if should_switch and "windows-1251" in alternatives:
                    prober, alt_conf = alternatives["windows-1251"]
                    self._best_guess_prober = prober
                    return alt_conf

        # Heuristic 5: Prefer MacCyrillic when Mac Cyrillic letter patterns present
        if self._has_mac_cyrillic_letter_pattern and "maccyrillic" in alternatives:
            prober, mac_conf = alternatives["maccyrillic"]
            is_cyrillic_win_or_iso = (
                lower_charset_name in CONFUSED_CYRILLIC_ENCODINGS
                and lower_charset_name != "maccyrillic"
            )
            if is_cyrillic_win_or_iso and mac_conf >= confidence * 0.90:
                self._best_guess_prober = prober
                return mac_conf

        # Heuristic 6: Mac/Windows/ISO disambiguation for CENTRAL EUROPEAN encodings
        is_central_european_family = (
            lower_charset_name in CONFUSED_CENTRAL_EUROPEAN_ENCODINGS
        )

        if is_central_european_family and lower_charset_name == "maclatin2":
            # MacLatin2 wins but no Mac patterns -> prefer Windows/ISO
            if not self._has_mac_latin_letter_pattern:
                alt_names = (
                    ("windows-1250", "iso-8859-2")
                    if self._has_win_bytes
                    else ("iso-8859-2", "windows-1250")
                )
                for alt_name in alt_names:
                    if alt_name in alternatives:
                        prober, alt_conf = alternatives[alt_name]
                        if alt_conf >= confidence * 0.995:
                            self._best_guess_prober = prober
                            return alt_conf

        elif is_central_european_family and lower_charset_name == "iso-8859-2":
            # ISO-8859-2 wins and has Windows bytes -> switch to Windows-1250
            # NOTE(review): same situation as the ISO-8859-5 branch above; the
            # generic "iso-8859" branch returns first whenever "windows-1250"
            # is in alternatives, so the MacLatin2 guard here has no effect.
            if self._has_win_bytes:
                should_switch = True
                if self._has_mac_latin_letter_pattern and "maclatin2" in alternatives:
                    _, mac_conf = alternatives["maclatin2"]
                    if mac_conf >= confidence * 0.995:
                        should_switch = False

                if should_switch and "windows-1250" in alternatives:
                    prober, alt_conf = alternatives["windows-1250"]
                    self._best_guess_prober = prober
                    return alt_conf

        # Heuristic 7: Prefer MacLatin2 when Mac Latin letter patterns present
        if self._has_mac_latin_letter_pattern and "maclatin2" in alternatives:
            prober, mac_conf = alternatives["maclatin2"]
            is_central_european_win_or_iso = (
                lower_charset_name in CONFUSED_CENTRAL_EUROPEAN_ENCODINGS
                and lower_charset_name != "maclatin2"
            )
            if is_central_european_win_or_iso and mac_conf >= confidence * 0.90:
                self._best_guess_prober = prober
                return mac_conf

        return confidence