Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/sbcsgroupprober.py: 61%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

221 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# The Original Code is Mozilla Universal charset detector code. 

3# 

4# The Initial Developer of the Original Code is 

5# Netscape Communications Corporation. 

6# Portions created by the Initial Developer are Copyright (C) 2001 

7# the Initial Developer. All Rights Reserved. 

8# 

9# Contributor(s): 

10# Mark Pilgrim - port to Python 

11# Shy Shalom - original C code 

12# 

13# This library is free software; you can redistribute it and/or 

14# modify it under the terms of the GNU Lesser General Public 

15# License as published by the Free Software Foundation; either 

16# version 2.1 of the License, or (at your option) any later version. 

17# 

18# This library is distributed in the hope that it will be useful, 

19# but WITHOUT ANY WARRANTY; without even the implied warranty of 

20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

21# Lesser General Public License for more details. 

22# 

23# You should have received a copy of the GNU Lesser General Public 

24# License along with this library; if not, see 

25# <https://www.gnu.org/licenses/>. 

26######################### END LICENSE BLOCK ######################### 

27 

28import re 

29from typing import Union 

30 

31from .charsetgroupprober import CharSetGroupProber 

32from .enums import EncodingEra, LanguageFilter, ProbingState 

33from .hebrewprober import HebrewProber 

34from .langarabicmodel import ( 

35 CP720_ARABIC_MODEL, 

36 CP864_ARABIC_MODEL, 

37 ISO_8859_6_ARABIC_MODEL, 

38 WINDOWS_1256_ARABIC_MODEL, 

39) 

40from .langbelarusianmodel import ( 

41 CP866_BELARUSIAN_MODEL, 

42 ISO_8859_5_BELARUSIAN_MODEL, 

43 MACCYRILLIC_BELARUSIAN_MODEL, 

44 WINDOWS_1251_BELARUSIAN_MODEL, 

45) 

46from .langbretonmodel import ( 

47 CP037_BRETON_MODEL, 

48 CP500_BRETON_MODEL, 

49 ISO_8859_14_BRETON_MODEL, 

50) 

51from .langbulgarianmodel import ( 

52 CP855_BULGARIAN_MODEL, 

53 ISO_8859_5_BULGARIAN_MODEL, 

54 MACCYRILLIC_BULGARIAN_MODEL, 

55 WINDOWS_1251_BULGARIAN_MODEL, 

56) 

57from .langcroatianmodel import ( 

58 CP852_CROATIAN_MODEL, 

59 ISO_8859_2_CROATIAN_MODEL, 

60 ISO_8859_16_CROATIAN_MODEL, 

61 MACLATIN2_CROATIAN_MODEL, 

62 WINDOWS_1250_CROATIAN_MODEL, 

63) 

64from .langczechmodel import ( 

65 ISO_8859_2_CZECH_MODEL, 

66 WINDOWS_1250_CZECH_MODEL, 

67) 

68from .langdanishmodel import ( 

69 CP037_DANISH_MODEL, 

70 CP500_DANISH_MODEL, 

71 CP850_DANISH_MODEL, 

72 CP858_DANISH_MODEL, 

73 CP865_DANISH_MODEL, 

74 ISO_8859_1_DANISH_MODEL, 

75 ISO_8859_15_DANISH_MODEL, 

76 MACROMAN_DANISH_MODEL, 

77 WINDOWS_1252_DANISH_MODEL, 

78) 

79from .langdutchmodel import ( 

80 CP037_DUTCH_MODEL, 

81 CP500_DUTCH_MODEL, 

82 CP850_DUTCH_MODEL, 

83 CP858_DUTCH_MODEL, 

84 ISO_8859_1_DUTCH_MODEL, 

85 ISO_8859_15_DUTCH_MODEL, 

86 MACROMAN_DUTCH_MODEL, 

87 WINDOWS_1252_DUTCH_MODEL, 

88) 

89from .langenglishmodel import ( 

90 CP037_ENGLISH_MODEL, 

91 CP437_ENGLISH_MODEL, 

92 CP500_ENGLISH_MODEL, 

93 CP850_ENGLISH_MODEL, 

94 CP858_ENGLISH_MODEL, 

95 ISO_8859_1_ENGLISH_MODEL, 

96 ISO_8859_15_ENGLISH_MODEL, 

97 MACROMAN_ENGLISH_MODEL, 

98 WINDOWS_1252_ENGLISH_MODEL, 

99) 

100from .langesperantomodel import ISO_8859_3_ESPERANTO_MODEL 

101from .langestonianmodel import ( 

102 CP775_ESTONIAN_MODEL, 

103 ISO_8859_4_ESTONIAN_MODEL, 

104 ISO_8859_13_ESTONIAN_MODEL, 

105 WINDOWS_1257_ESTONIAN_MODEL, 

106) 

107from .langfarsimodel import ( 

108 ISO_8859_6_FARSI_MODEL, 

109 WINDOWS_1256_FARSI_MODEL, 

110) 

111from .langfinnishmodel import ( 

112 CP037_FINNISH_MODEL, 

113 CP500_FINNISH_MODEL, 

114 CP850_FINNISH_MODEL, 

115 CP858_FINNISH_MODEL, 

116 ISO_8859_1_FINNISH_MODEL, 

117 ISO_8859_15_FINNISH_MODEL, 

118 MACROMAN_FINNISH_MODEL, 

119 WINDOWS_1252_FINNISH_MODEL, 

120) 

121from .langfrenchmodel import ( 

122 CP037_FRENCH_MODEL, 

123 CP500_FRENCH_MODEL, 

124 CP850_FRENCH_MODEL, 

125 CP858_FRENCH_MODEL, 

126 CP863_FRENCH_MODEL, 

127 ISO_8859_1_FRENCH_MODEL, 

128 ISO_8859_15_FRENCH_MODEL, 

129 MACROMAN_FRENCH_MODEL, 

130 WINDOWS_1252_FRENCH_MODEL, 

131) 

132from .langgermanmodel import ( 

133 CP037_GERMAN_MODEL, 

134 CP500_GERMAN_MODEL, 

135 CP850_GERMAN_MODEL, 

136 CP858_GERMAN_MODEL, 

137 ISO_8859_1_GERMAN_MODEL, 

138 ISO_8859_15_GERMAN_MODEL, 

139 MACROMAN_GERMAN_MODEL, 

140 WINDOWS_1252_GERMAN_MODEL, 

141) 

142from .langgreekmodel import ( 

143 CP737_GREEK_MODEL, 

144 CP869_GREEK_MODEL, 

145 CP875_GREEK_MODEL, 

146 ISO_8859_7_GREEK_MODEL, 

147 MACGREEK_GREEK_MODEL, 

148 WINDOWS_1253_GREEK_MODEL, 

149) 

150from .langhebrewmodel import ( 

151 CP424_HEBREW_MODEL, 

152 CP856_HEBREW_MODEL, 

153 CP862_HEBREW_MODEL, 

154 ISO_8859_8_HEBREW_MODEL, 

155 WINDOWS_1255_HEBREW_MODEL, 

156) 

157from .langhungarianmodel import ( 

158 CP852_HUNGARIAN_MODEL, 

159 ISO_8859_2_HUNGARIAN_MODEL, 

160 ISO_8859_16_HUNGARIAN_MODEL, 

161 MACLATIN2_HUNGARIAN_MODEL, 

162 WINDOWS_1250_HUNGARIAN_MODEL, 

163) 

164from .langicelandicmodel import ( 

165 CP037_ICELANDIC_MODEL, 

166 CP500_ICELANDIC_MODEL, 

167 CP861_ICELANDIC_MODEL, 

168 ISO_8859_1_ICELANDIC_MODEL, 

169 ISO_8859_10_ICELANDIC_MODEL, 

170 MACICELAND_ICELANDIC_MODEL, 

171) 

172from .langindonesianmodel import ( 

173 CP037_INDONESIAN_MODEL, 

174 CP500_INDONESIAN_MODEL, 

175 ISO_8859_1_INDONESIAN_MODEL, 

176 MACROMAN_INDONESIAN_MODEL, 

177 WINDOWS_1252_INDONESIAN_MODEL, 

178) 

179from .langirishmodel import ( 

180 CP037_IRISH_MODEL, 

181 CP500_IRISH_MODEL, 

182 ISO_8859_14_IRISH_MODEL, 

183) 

184from .langitalianmodel import ( 

185 CP037_ITALIAN_MODEL, 

186 CP500_ITALIAN_MODEL, 

187 CP850_ITALIAN_MODEL, 

188 CP858_ITALIAN_MODEL, 

189 ISO_8859_1_ITALIAN_MODEL, 

190 ISO_8859_15_ITALIAN_MODEL, 

191 MACROMAN_ITALIAN_MODEL, 

192 WINDOWS_1252_ITALIAN_MODEL, 

193) 

194from .langkazakhmodel import ( 

195 KZ1048_KAZAKH_MODEL, 

196 PTCP154_KAZAKH_MODEL, 

197) 

198from .langlatvianmodel import ( 

199 CP775_LATVIAN_MODEL, 

200 ISO_8859_4_LATVIAN_MODEL, 

201 ISO_8859_13_LATVIAN_MODEL, 

202 WINDOWS_1257_LATVIAN_MODEL, 

203) 

204from .langlithuanianmodel import ( 

205 CP775_LITHUANIAN_MODEL, 

206 ISO_8859_4_LITHUANIAN_MODEL, 

207 ISO_8859_13_LITHUANIAN_MODEL, 

208 WINDOWS_1257_LITHUANIAN_MODEL, 

209) 

210from .langmacedonianmodel import ( 

211 CP855_MACEDONIAN_MODEL, 

212 ISO_8859_5_MACEDONIAN_MODEL, 

213 MACCYRILLIC_MACEDONIAN_MODEL, 

214 WINDOWS_1251_MACEDONIAN_MODEL, 

215) 

216from .langmalaymodel import ( 

217 CP037_MALAY_MODEL, 

218 CP500_MALAY_MODEL, 

219 ISO_8859_1_MALAY_MODEL, 

220 MACROMAN_MALAY_MODEL, 

221 WINDOWS_1252_MALAY_MODEL, 

222) 

223from .langmaltesemodel import ISO_8859_3_MALTESE_MODEL 

224from .langnorwegianmodel import ( 

225 CP037_NORWEGIAN_MODEL, 

226 CP500_NORWEGIAN_MODEL, 

227 CP850_NORWEGIAN_MODEL, 

228 CP858_NORWEGIAN_MODEL, 

229 CP865_NORWEGIAN_MODEL, 

230 ISO_8859_1_NORWEGIAN_MODEL, 

231 ISO_8859_15_NORWEGIAN_MODEL, 

232 MACROMAN_NORWEGIAN_MODEL, 

233 WINDOWS_1252_NORWEGIAN_MODEL, 

234) 

235from .langpolishmodel import ( 

236 CP852_POLISH_MODEL, 

237 ISO_8859_2_POLISH_MODEL, 

238 ISO_8859_16_POLISH_MODEL, 

239 MACLATIN2_POLISH_MODEL, 

240 WINDOWS_1250_POLISH_MODEL, 

241) 

242from .langportuguesemodel import ( 

243 CP037_PORTUGUESE_MODEL, 

244 CP500_PORTUGUESE_MODEL, 

245 CP850_PORTUGUESE_MODEL, 

246 CP858_PORTUGUESE_MODEL, 

247 CP860_PORTUGUESE_MODEL, 

248 ISO_8859_1_PORTUGUESE_MODEL, 

249 ISO_8859_15_PORTUGUESE_MODEL, 

250 MACROMAN_PORTUGUESE_MODEL, 

251 WINDOWS_1252_PORTUGUESE_MODEL, 

252) 

253from .langromanianmodel import ( 

254 CP852_ROMANIAN_MODEL, 

255 ISO_8859_2_ROMANIAN_MODEL, 

256 ISO_8859_16_ROMANIAN_MODEL, 

257 MACLATIN2_ROMANIAN_MODEL, 

258 WINDOWS_1250_ROMANIAN_MODEL, 

259) 

260from .langrussianmodel import ( 

261 CP855_RUSSIAN_MODEL, 

262 CP866_RUSSIAN_MODEL, 

263 ISO_8859_5_RUSSIAN_MODEL, 

264 KOI8_R_RUSSIAN_MODEL, 

265 MACCYRILLIC_RUSSIAN_MODEL, 

266 WINDOWS_1251_RUSSIAN_MODEL, 

267) 

268from .langscottishgaelicmodel import ( 

269 CP037_SCOTTISH_GAELIC_MODEL, 

270 CP500_SCOTTISH_GAELIC_MODEL, 

271 ISO_8859_14_SCOTTISH_GAELIC_MODEL, 

272) 

273from .langserbianmodel import ( 

274 CP855_SERBIAN_MODEL, 

275 ISO_8859_5_SERBIAN_MODEL, 

276 MACCYRILLIC_SERBIAN_MODEL, 

277 WINDOWS_1251_SERBIAN_MODEL, 

278) 

279from .langslovakmodel import ( 

280 CP852_SLOVAK_MODEL, 

281 ISO_8859_2_SLOVAK_MODEL, 

282 ISO_8859_16_SLOVAK_MODEL, 

283 MACLATIN2_SLOVAK_MODEL, 

284 WINDOWS_1250_SLOVAK_MODEL, 

285) 

286from .langslovenemodel import ( 

287 CP852_SLOVENE_MODEL, 

288 ISO_8859_2_SLOVENE_MODEL, 

289 ISO_8859_16_SLOVENE_MODEL, 

290 MACLATIN2_SLOVENE_MODEL, 

291 WINDOWS_1250_SLOVENE_MODEL, 

292) 

293from .langspanishmodel import ( 

294 CP037_SPANISH_MODEL, 

295 CP500_SPANISH_MODEL, 

296 CP850_SPANISH_MODEL, 

297 CP858_SPANISH_MODEL, 

298 ISO_8859_1_SPANISH_MODEL, 

299 ISO_8859_15_SPANISH_MODEL, 

300 MACROMAN_SPANISH_MODEL, 

301 WINDOWS_1252_SPANISH_MODEL, 

302) 

303from .langswedishmodel import ( 

304 CP037_SWEDISH_MODEL, 

305 CP500_SWEDISH_MODEL, 

306 CP850_SWEDISH_MODEL, 

307 CP858_SWEDISH_MODEL, 

308 ISO_8859_1_SWEDISH_MODEL, 

309 ISO_8859_15_SWEDISH_MODEL, 

310 MACROMAN_SWEDISH_MODEL, 

311 WINDOWS_1252_SWEDISH_MODEL, 

312) 

313from .langtajikmodel import KOI8_T_TAJIK_MODEL 

314from .langthaimodel import ( 

315 CP874_THAI_MODEL, 

316 ISO_8859_11_THAI_MODEL, 

317 TIS_620_THAI_MODEL, 

318) 

319from .langturkishmodel import ( 

320 CP857_TURKISH_MODEL, 

321 CP1026_TURKISH_MODEL, 

322 ISO_8859_3_TURKISH_MODEL, 

323 ISO_8859_9_TURKISH_MODEL, 

324 MACTURKISH_TURKISH_MODEL, 

325 WINDOWS_1254_TURKISH_MODEL, 

326) 

327from .langukrainianmodel import ( 

328 CP1125_UKRAINIAN_MODEL, 

329 ISO_8859_5_UKRAINIAN_MODEL, 

330 KOI8_U_UKRAINIAN_MODEL, 

331 MACCYRILLIC_UKRAINIAN_MODEL, 

332 WINDOWS_1251_UKRAINIAN_MODEL, 

333) 

334from .langvietnamesemodel import WINDOWS_1258_VIETNAMESE_MODEL 

335from .langwelshmodel import ( 

336 CP037_WELSH_MODEL, 

337 CP500_WELSH_MODEL, 

338 ISO_8859_14_WELSH_MODEL, 

339) 

340from .sbcharsetprober import SingleByteCharSetProber 

341 

# Byte-pattern detectors used to tell single-byte encoding families apart.
# The 0x80-0x9F range means something different in each family:
#   * Windows code pages: printable punctuation (smart quotes, dashes,
#     currency symbols)
#   * Mac encodings: printable letters (accented letters and diacriticals)
#   * ISO-8859-x: C1 control codes (mostly unprintable)

# Any byte in the 0x80-0x9F range.
WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")

# Bytes that are letters in the Mac Latin encodings but control codes or
# punctuation in the Windows/ISO ones.
MAC_LATIN_ONLY_LETTER_DETECTOR = re.compile(b"[\x81\x8d\x8f\x90\x92\x9d]")

# Bytes that are Cyrillic letters in MacCyrillic but punctuation in
# Windows-1251.
MAC_CYRILLIC_ONLY_LETTER_DETECTOR = re.compile(
    b"[\x82\x84\x85\x86\x87\x88\x89\x8b\x91\x92\x93\x94\x95\x96\x97\x99\x9b]"
)

# A 0x80-0x9F byte sandwiched between two ASCII letters, which hints at a
# Mac encoding (the byte is acting as a letter inside a word).
MAC_LETTER_IN_WORD_DETECTOR = re.compile(b"[a-zA-Z][\x80-\x9f][a-zA-Z]")

# Byte 0xA4: the Euro sign in ISO-8859-15, the generic currency sign in
# ISO-8859-1.
EURO_SIGN_DETECTOR = re.compile(b"\xa4")

# Latin-script charsets that get confused with each other because 0x80-0x9F
# holds letters on Mac and punctuation on Windows.
CONFUSED_LATIN_ENCODINGS = frozenset(
    (
        "macroman",
        "windows-1252",
        "iso-8859-1",
        "iso-8859-15",
        "macgreek",
        "windows-1253",
        "iso-8859-7",
        "macturkish",
        "windows-1254",
        "iso-8859-9",
        # Turkish/Maltese/Esperanto (also works with MacTurkish/Win-1254)
        "iso-8859-3",
        "maciceland",
        "iso-8859-10",
        "iso-8859-14",
    )
)

# Same confusion, Central European charsets.
CONFUSED_CENTRAL_EUROPEAN_ENCODINGS = frozenset(
    (
        "maclatin2",
        "windows-1250",
        "iso-8859-2",
        # Southeast European/Romanian (close to Latin-2)
        "iso-8859-16",
    )
)

# Same confusion, Cyrillic charsets.
CONFUSED_CYRILLIC_ENCODINGS = frozenset(
    (
        "maccyrillic",
        "windows-1251",
        "iso-8859-5",
    )
)

# Lower-case ISO charset name -> name of the corresponding Windows code page.
ISO_WIN_MAP = {
    "iso-8859-1": "Windows-1252",
    "iso-8859-2": "Windows-1250",
    "iso-8859-5": "Windows-1251",
    "iso-8859-6": "Windows-1256",
    "iso-8859-7": "Windows-1253",
    "iso-8859-8": "Windows-1255",
    "iso-8859-9": "Windows-1254",
    "iso-8859-13": "Windows-1257",
}

409 

410 

411class SBCSGroupProber(CharSetGroupProber): 

def __init__(
    self,
    lang_filter: LanguageFilter = LanguageFilter.ALL,
    encoding_era: EncodingEra = EncodingEra.MODERN_WEB,
) -> None:
    """Create one prober per single-byte language model and reset state.

    Both arguments are forwarded unchanged to the parent constructor.
    The full candidate list is then passed through ``_filter_probers``
    (defined outside this block; presumably it narrows the list by
    encoding era and language) before ``reset()`` is called.
    """
    super().__init__(lang_filter=lang_filter, encoding_era=encoding_era)

    # Byte-pattern flags consulted by the disambiguation heuristics.
    self._has_win_bytes = False
    self._has_mac_latin_letter_pattern = False
    self._has_mac_cyrillic_letter_pattern = False
    self._has_euro_sign = False
    self._input_bytes = bytearray()

    # The logical and visual Hebrew model probers share one HebrewProber
    # as their name prober, and that HebrewProber is told about both.
    hebrew_name_prober = HebrewProber()
    logical_hebrew = SingleByteCharSetProber(
        WINDOWS_1255_HEBREW_MODEL,
        is_reversed=False,
        name_prober=hebrew_name_prober,
    )
    visual_hebrew = SingleByteCharSetProber(
        ISO_8859_8_HEBREW_MODEL,
        is_reversed=True,
        name_prober=hebrew_name_prober,
    )
    hebrew_name_prober.set_model_probers(logical_hebrew, visual_hebrew)

    # TODO: ORDER MATTERS HERE. Changing the order relative to master made
    #       several previously passing tests fail. Some thought should be
    #       put into the ordering, and we should consider making order not
    #       matter here, because that is very counter-intuitive.
    models_before_hebrew = (
        CP720_ARABIC_MODEL,
        CP864_ARABIC_MODEL,
        ISO_8859_6_ARABIC_MODEL,
        WINDOWS_1256_ARABIC_MODEL,
        CP866_BELARUSIAN_MODEL,
        ISO_8859_5_BELARUSIAN_MODEL,
        MACCYRILLIC_BELARUSIAN_MODEL,
        WINDOWS_1251_BELARUSIAN_MODEL,
        ISO_8859_14_BRETON_MODEL,
        CP037_BRETON_MODEL,
        CP500_BRETON_MODEL,
        CP855_BULGARIAN_MODEL,
        ISO_8859_5_BULGARIAN_MODEL,
        MACCYRILLIC_BULGARIAN_MODEL,
        WINDOWS_1251_BULGARIAN_MODEL,
        CP852_CROATIAN_MODEL,
        ISO_8859_16_CROATIAN_MODEL,
        ISO_8859_2_CROATIAN_MODEL,
        MACLATIN2_CROATIAN_MODEL,
        WINDOWS_1250_CROATIAN_MODEL,
        ISO_8859_2_CZECH_MODEL,
        WINDOWS_1250_CZECH_MODEL,
        CP037_DANISH_MODEL,
        CP500_DANISH_MODEL,
        CP850_DANISH_MODEL,
        CP858_DANISH_MODEL,
        CP865_DANISH_MODEL,
        ISO_8859_15_DANISH_MODEL,
        ISO_8859_1_DANISH_MODEL,
        MACROMAN_DANISH_MODEL,
        WINDOWS_1252_DANISH_MODEL,
        CP037_DUTCH_MODEL,
        CP500_DUTCH_MODEL,
        CP850_DUTCH_MODEL,
        CP858_DUTCH_MODEL,
        ISO_8859_15_DUTCH_MODEL,
        ISO_8859_1_DUTCH_MODEL,
        MACROMAN_DUTCH_MODEL,
        WINDOWS_1252_DUTCH_MODEL,
        CP037_ENGLISH_MODEL,
        CP437_ENGLISH_MODEL,
        CP500_ENGLISH_MODEL,
        CP850_ENGLISH_MODEL,
        CP858_ENGLISH_MODEL,
        ISO_8859_15_ENGLISH_MODEL,
        ISO_8859_1_ENGLISH_MODEL,
        MACROMAN_ENGLISH_MODEL,
        WINDOWS_1252_ENGLISH_MODEL,
        ISO_8859_3_ESPERANTO_MODEL,
        CP775_ESTONIAN_MODEL,
        ISO_8859_13_ESTONIAN_MODEL,
        ISO_8859_4_ESTONIAN_MODEL,
        WINDOWS_1257_ESTONIAN_MODEL,
        ISO_8859_6_FARSI_MODEL,
        WINDOWS_1256_FARSI_MODEL,
        CP037_FINNISH_MODEL,
        CP500_FINNISH_MODEL,
        CP850_FINNISH_MODEL,
        CP858_FINNISH_MODEL,
        ISO_8859_15_FINNISH_MODEL,
        ISO_8859_1_FINNISH_MODEL,
        MACROMAN_FINNISH_MODEL,
        WINDOWS_1252_FINNISH_MODEL,
        CP037_FRENCH_MODEL,
        CP500_FRENCH_MODEL,
        CP850_FRENCH_MODEL,
        CP858_FRENCH_MODEL,
        CP863_FRENCH_MODEL,
        ISO_8859_15_FRENCH_MODEL,
        ISO_8859_1_FRENCH_MODEL,
        MACROMAN_FRENCH_MODEL,
        WINDOWS_1252_FRENCH_MODEL,
        CP037_GERMAN_MODEL,
        CP500_GERMAN_MODEL,
        CP850_GERMAN_MODEL,
        CP858_GERMAN_MODEL,
        ISO_8859_15_GERMAN_MODEL,
        ISO_8859_1_GERMAN_MODEL,
        MACROMAN_GERMAN_MODEL,
        WINDOWS_1252_GERMAN_MODEL,
        CP737_GREEK_MODEL,
        CP869_GREEK_MODEL,
        CP875_GREEK_MODEL,
        ISO_8859_7_GREEK_MODEL,
        MACGREEK_GREEK_MODEL,
        WINDOWS_1253_GREEK_MODEL,
    )
    # These Hebrew models are probed with is_reversed=True.
    reversed_hebrew_models = (
        CP424_HEBREW_MODEL,
        CP856_HEBREW_MODEL,
        CP862_HEBREW_MODEL,
    )
    models_after_hebrew = (
        CP852_HUNGARIAN_MODEL,
        ISO_8859_16_HUNGARIAN_MODEL,
        ISO_8859_2_HUNGARIAN_MODEL,
        MACLATIN2_HUNGARIAN_MODEL,
        WINDOWS_1250_HUNGARIAN_MODEL,
        CP037_ICELANDIC_MODEL,
        CP500_ICELANDIC_MODEL,
        CP861_ICELANDIC_MODEL,
        ISO_8859_10_ICELANDIC_MODEL,
        ISO_8859_1_ICELANDIC_MODEL,
        MACICELAND_ICELANDIC_MODEL,
        CP037_INDONESIAN_MODEL,
        CP500_INDONESIAN_MODEL,
        ISO_8859_1_INDONESIAN_MODEL,
        MACROMAN_INDONESIAN_MODEL,
        WINDOWS_1252_INDONESIAN_MODEL,
        ISO_8859_14_IRISH_MODEL,
        CP037_IRISH_MODEL,
        CP500_IRISH_MODEL,
        CP037_ITALIAN_MODEL,
        CP500_ITALIAN_MODEL,
        CP850_ITALIAN_MODEL,
        CP858_ITALIAN_MODEL,
        ISO_8859_15_ITALIAN_MODEL,
        ISO_8859_1_ITALIAN_MODEL,
        MACROMAN_ITALIAN_MODEL,
        WINDOWS_1252_ITALIAN_MODEL,
        KZ1048_KAZAKH_MODEL,
        PTCP154_KAZAKH_MODEL,
        CP775_LATVIAN_MODEL,
        ISO_8859_13_LATVIAN_MODEL,
        ISO_8859_4_LATVIAN_MODEL,
        WINDOWS_1257_LATVIAN_MODEL,
        CP775_LITHUANIAN_MODEL,
        ISO_8859_13_LITHUANIAN_MODEL,
        ISO_8859_4_LITHUANIAN_MODEL,
        WINDOWS_1257_LITHUANIAN_MODEL,
        CP855_MACEDONIAN_MODEL,
        ISO_8859_5_MACEDONIAN_MODEL,
        MACCYRILLIC_MACEDONIAN_MODEL,
        WINDOWS_1251_MACEDONIAN_MODEL,
        CP037_MALAY_MODEL,
        CP500_MALAY_MODEL,
        ISO_8859_1_MALAY_MODEL,
        MACROMAN_MALAY_MODEL,
        WINDOWS_1252_MALAY_MODEL,
        ISO_8859_3_MALTESE_MODEL,
        CP037_NORWEGIAN_MODEL,
        CP500_NORWEGIAN_MODEL,
        CP850_NORWEGIAN_MODEL,
        CP858_NORWEGIAN_MODEL,
        CP865_NORWEGIAN_MODEL,
        ISO_8859_15_NORWEGIAN_MODEL,
        ISO_8859_1_NORWEGIAN_MODEL,
        MACROMAN_NORWEGIAN_MODEL,
        WINDOWS_1252_NORWEGIAN_MODEL,
        CP852_POLISH_MODEL,
        ISO_8859_16_POLISH_MODEL,
        ISO_8859_2_POLISH_MODEL,
        MACLATIN2_POLISH_MODEL,
        WINDOWS_1250_POLISH_MODEL,
        CP037_PORTUGUESE_MODEL,
        CP500_PORTUGUESE_MODEL,
        CP850_PORTUGUESE_MODEL,
        CP858_PORTUGUESE_MODEL,
        CP860_PORTUGUESE_MODEL,
        ISO_8859_15_PORTUGUESE_MODEL,
        ISO_8859_1_PORTUGUESE_MODEL,
        MACROMAN_PORTUGUESE_MODEL,
        WINDOWS_1252_PORTUGUESE_MODEL,
        CP852_ROMANIAN_MODEL,
        ISO_8859_16_ROMANIAN_MODEL,
        ISO_8859_2_ROMANIAN_MODEL,
        MACLATIN2_ROMANIAN_MODEL,
        WINDOWS_1250_ROMANIAN_MODEL,
        CP855_RUSSIAN_MODEL,
        CP866_RUSSIAN_MODEL,
        ISO_8859_5_RUSSIAN_MODEL,
        KOI8_R_RUSSIAN_MODEL,
        MACCYRILLIC_RUSSIAN_MODEL,
        WINDOWS_1251_RUSSIAN_MODEL,
        CP855_SERBIAN_MODEL,
        ISO_8859_5_SERBIAN_MODEL,
        MACCYRILLIC_SERBIAN_MODEL,
        WINDOWS_1251_SERBIAN_MODEL,
        ISO_8859_14_SCOTTISH_GAELIC_MODEL,
        CP037_SCOTTISH_GAELIC_MODEL,
        CP500_SCOTTISH_GAELIC_MODEL,
        CP852_SLOVAK_MODEL,
        ISO_8859_16_SLOVAK_MODEL,
        ISO_8859_2_SLOVAK_MODEL,
        MACLATIN2_SLOVAK_MODEL,
        WINDOWS_1250_SLOVAK_MODEL,
        CP852_SLOVENE_MODEL,
        ISO_8859_16_SLOVENE_MODEL,
        ISO_8859_2_SLOVENE_MODEL,
        MACLATIN2_SLOVENE_MODEL,
        WINDOWS_1250_SLOVENE_MODEL,
        CP037_SPANISH_MODEL,
        CP500_SPANISH_MODEL,
        CP850_SPANISH_MODEL,
        CP858_SPANISH_MODEL,
        ISO_8859_15_SPANISH_MODEL,
        ISO_8859_1_SPANISH_MODEL,
        MACROMAN_SPANISH_MODEL,
        WINDOWS_1252_SPANISH_MODEL,
        CP037_SWEDISH_MODEL,
        CP500_SWEDISH_MODEL,
        CP850_SWEDISH_MODEL,
        CP858_SWEDISH_MODEL,
        ISO_8859_15_SWEDISH_MODEL,
        ISO_8859_1_SWEDISH_MODEL,
        MACROMAN_SWEDISH_MODEL,
        WINDOWS_1252_SWEDISH_MODEL,
        KOI8_T_TAJIK_MODEL,
        CP874_THAI_MODEL,
        ISO_8859_11_THAI_MODEL,
        TIS_620_THAI_MODEL,
        CP1026_TURKISH_MODEL,
        CP857_TURKISH_MODEL,
        ISO_8859_3_TURKISH_MODEL,
        ISO_8859_9_TURKISH_MODEL,
        MACTURKISH_TURKISH_MODEL,
        WINDOWS_1254_TURKISH_MODEL,
        CP1125_UKRAINIAN_MODEL,
        ISO_8859_5_UKRAINIAN_MODEL,
        KOI8_U_UKRAINIAN_MODEL,
        MACCYRILLIC_UKRAINIAN_MODEL,
        WINDOWS_1251_UKRAINIAN_MODEL,
        ISO_8859_14_WELSH_MODEL,
        CP037_WELSH_MODEL,
        CP500_WELSH_MODEL,
        WINDOWS_1258_VIETNAMESE_MODEL,
    )

    # Assemble the candidates in exactly the order listed above, with the
    # Hebrew probers slotted in between the two plain groups.
    candidates = [SingleByteCharSetProber(model) for model in models_before_hebrew]
    candidates += [
        SingleByteCharSetProber(model, is_reversed=True)
        for model in reversed_hebrew_models
    ]
    candidates += [hebrew_name_prober, logical_hebrew, visual_hebrew]
    candidates += [SingleByteCharSetProber(model) for model in models_after_hebrew]
    self.probers = candidates

    # Narrow the candidate list, then start from a clean state.
    self.probers = self._filter_probers(self.probers)
    self.reset()

670 

def reset(self) -> None:
    """Reset the parent prober state and clear all byte-pattern tracking."""
    super().reset()
    # Start a fresh input buffer and forget every pattern seen so far.
    self._input_bytes = bytearray()
    self._has_euro_sign = False
    self._has_mac_cyrillic_letter_pattern = False
    self._has_mac_latin_letter_pattern = False
    self._has_win_bytes = False

678 

def feed(self, byte_str: Union[bytes, bytearray]) -> "ProbingState":
    """Record disambiguation byte patterns in ``byte_str``, then feed it on.

    The chunk is appended to ``self._input_bytes`` and scanned with the
    module-level detectors. Each ``_has_*`` flag is sticky: once set it
    stays set until ``reset()``, so a detector whose flag is already True
    is not run again.

    The in-word Mac letter detector matches three consecutive bytes, so a
    match may straddle two ``feed()`` calls. It is therefore run against
    the accumulated buffer, starting two bytes before the new data, rather
    than against the new chunk alone; otherwise the result would depend on
    how the caller happened to split the input.

    Args:
        byte_str: The next chunk of input bytes.

    Returns:
        Whatever the parent class's ``feed`` returns for this chunk.
    """
    # Size of the buffer before this chunk; marks where the new data begins.
    already_buffered = len(self._input_bytes)
    self._input_bytes.extend(byte_str)

    if not self._has_win_bytes and WIN_BYTE_DETECTOR.search(byte_str):
        self._has_win_bytes = True

    if not self._has_mac_latin_letter_pattern:
        # Back up two bytes so a letter/high-byte/letter sequence split
        # across the chunk boundary is still found. Passing ``pos`` avoids
        # copying the buffer.
        rescan_from = max(already_buffered - 2, 0)
        if MAC_LATIN_ONLY_LETTER_DETECTOR.search(
            byte_str
        ) or MAC_LETTER_IN_WORD_DETECTOR.search(self._input_bytes, rescan_from):
            self._has_mac_latin_letter_pattern = True

    if (
        not self._has_mac_cyrillic_letter_pattern
        and MAC_CYRILLIC_ONLY_LETTER_DETECTOR.search(byte_str)
    ):
        self._has_mac_cyrillic_letter_pattern = True

    if not self._has_euro_sign and EURO_SIGN_DETECTOR.search(byte_str):
        self._has_euro_sign = True

    # Hand the unmodified chunk to the group prober implementation.
    return super().feed(byte_str)

697 

698 def get_confidence(self) -> float: 

699 # Get base confidence from parent 

700 base_confidence = super().get_confidence() 

701 

702 # If no best prober yet, return base confidence 

703 if not self._best_guess_prober: 

704 return base_confidence 

705 

706 # Apply heuristics to disambiguate confused encodings 

707 charset_name = self._best_guess_prober.charset_name 

708 if not charset_name: 

709 return base_confidence 

710 

711 confidence = base_confidence 

712 lower_charset_name = charset_name.lower() 

713 

714 # Build alternatives dict: best prober for each charset (excluding winner) 

715 alternatives = {} 

716 for prober in self.probers: 

717 if not prober.active or prober == self._best_guess_prober: 

718 continue 

719 alt_name = (prober.charset_name or "").lower() 

720 alt_conf = prober.get_confidence() 

721 if alt_name not in alternatives or alt_conf > alternatives[alt_name][1]: 

722 alternatives[alt_name] = (prober, alt_conf) 

723 

724 # Heuristic 1: Mac/Windows/ISO disambiguation for LATIN encodings 

725 is_latin_family = lower_charset_name in CONFUSED_LATIN_ENCODINGS 

726 

727 if is_latin_family and lower_charset_name == "macroman": 

728 # MacRoman wins but no Mac patterns → prefer ISO/Windows 

729 # If we have Win bytes, prefer Windows encodings specifically 

730 if not self._has_mac_latin_letter_pattern: 

731 alt_names = ( 

732 ("windows-1252", "iso-8859-1", "iso-8859-15") 

733 if self._has_win_bytes 

734 else ("iso-8859-1", "windows-1252", "iso-8859-15") 

735 ) 

736 for alt_name in alt_names: 

737 if alt_name in alternatives: 

738 prober, alt_conf = alternatives[alt_name] 

739 if alt_conf >= confidence * 0.995: # Within 0.5% 

740 self._best_guess_prober = prober 

741 return alt_conf 

742 

743 # Cross-family Mac vs Windows disambiguation 

744 # If ANY Mac encoding wins but we have Windows bytes and no Mac patterns, 

745 # prefer any close Windows alternative (even from different language family) 

746 # This handles cases where MacRoman/MacLatin2/etc wins against text in a different family 

747 if ( 

748 lower_charset_name.startswith("mac") 

749 and self._has_win_bytes 

750 and not self._has_mac_latin_letter_pattern 

751 ): 

752 # Look for Windows alternatives 

753 win_alternatives = [ 

754 (name, prober, conf) 

755 for name, (prober, conf) in alternatives.items() 

756 if name.startswith("windows-") 

757 ] 

758 if win_alternatives: 

759 # Sort by confidence and take the best Windows alternative 

760 win_alternatives.sort(key=lambda x: -x[2]) 

761 best_win_name, best_win_prober, best_win_conf = win_alternatives[0] 

762 if best_win_conf >= confidence * 0.995: # Within 0.5% 

763 self._best_guess_prober = best_win_prober 

764 return best_win_conf 

765 

766 elif lower_charset_name.startswith("iso-8859"): 

767 is_latin_iso = lower_charset_name in CONFUSED_LATIN_ENCODINGS 

768 

769 # ISO wins and has Windows bytes → switch to Windows 

770 if self._has_win_bytes: 

771 should_switch = True 

772 # But check if Mac is close with Mac patterns (Latin only) 

773 if is_latin_iso and self._has_mac_latin_letter_pattern: 

774 for mac_name in alternatives: 

775 if ( 

776 mac_name.startswith("mac") 

777 and mac_name in CONFUSED_LATIN_ENCODINGS 

778 ): 

779 _, mac_conf = alternatives[mac_name] 

780 if mac_conf >= confidence * 0.995: 

781 should_switch = False 

782 break 

783 

784 if should_switch: 

785 win_name = ISO_WIN_MAP.get(lower_charset_name) 

786 if win_name and win_name.lower() in alternatives: 

787 prober, alt_conf = alternatives[win_name.lower()] 

788 self._best_guess_prober = prober 

789 return alt_conf 

790 

791 # ISO-8859-1 with Euro sign → prefer ISO-8859-15 

792 if lower_charset_name == "iso-8859-1" and self._has_euro_sign: 

793 if "iso-8859-15" in alternatives: 

794 prober, alt_conf = alternatives["iso-8859-15"] 

795 if alt_conf >= confidence * 0.99: 

796 self._best_guess_prober = prober 

797 return alt_conf 

798 

799 # Heuristic 2: Euro sign detection for Latin encodings 

800 if self._has_euro_sign and "iso-8859-15" in alternatives: 

801 is_latin_encoding = lower_charset_name in CONFUSED_LATIN_ENCODINGS 

802 if is_latin_encoding: 

803 prober, alt_conf = alternatives["iso-8859-15"] 

804 if alt_conf >= confidence * 0.99: 

805 self._best_guess_prober = prober 

806 return alt_conf 

807 

808 # Heuristic 3: Prefer Mac over Windows/ISO when Mac Latin letter patterns present 

809 if self._has_mac_latin_letter_pattern: 

810 mac_alternatives = [ 

811 name 

812 for name in alternatives 

813 if name.startswith("mac") and name in CONFUSED_LATIN_ENCODINGS 

814 ] 

815 for mac_name in mac_alternatives: 

816 prober, mac_conf = alternatives[mac_name] 

817 is_latin_win_or_iso = ( 

818 lower_charset_name in CONFUSED_LATIN_ENCODINGS 

819 and not lower_charset_name.startswith("mac") 

820 ) 

821 if is_latin_win_or_iso and mac_conf >= confidence * 0.90: 

822 self._best_guess_prober = prober 

823 return mac_conf 

824 

825 # Heuristic 4: Mac/Windows/ISO disambiguation for CYRILLIC encodings 

826 is_cyrillic_family = lower_charset_name in CONFUSED_CYRILLIC_ENCODINGS 

827 

828 if is_cyrillic_family and lower_charset_name == "maccyrillic": 

829 # MacCyrillic wins but no Mac Cyrillic patterns and no Windows bytes → prefer Windows/ISO 

830 if not self._has_mac_cyrillic_letter_pattern and not self._has_win_bytes: 

831 for alt_name in ("windows-1251", "iso-8859-5"): 

832 if alt_name in alternatives: 

833 prober, alt_conf = alternatives[alt_name] 

834 if alt_conf >= confidence * 0.995: 

835 self._best_guess_prober = prober 

836 return alt_conf 

837 

838 elif is_cyrillic_family and lower_charset_name == "iso-8859-5": 

839 # ISO-8859-5 wins and has Windows bytes → switch to Windows-1251 

840 if self._has_win_bytes: 

841 should_switch = True 

842 if ( 

843 self._has_mac_cyrillic_letter_pattern 

844 and "maccyrillic" in alternatives 

845 ): 

846 _, mac_conf = alternatives["maccyrillic"] 

847 if mac_conf >= confidence * 0.995: 

848 should_switch = False 

849 

850 if should_switch and "windows-1251" in alternatives: 

851 prober, alt_conf = alternatives["windows-1251"] 

852 self._best_guess_prober = prober 

853 return alt_conf 

854 

855 # Heuristic 5: Prefer MacCyrillic when Mac Cyrillic letter patterns present 

856 if self._has_mac_cyrillic_letter_pattern and "maccyrillic" in alternatives: 

857 prober, mac_conf = alternatives["maccyrillic"] 

858 is_cyrillic_win_or_iso = ( 

859 lower_charset_name in CONFUSED_CYRILLIC_ENCODINGS 

860 and lower_charset_name != "maccyrillic" 

861 ) 

862 if is_cyrillic_win_or_iso and mac_conf >= confidence * 0.90: 

863 self._best_guess_prober = prober 

864 return mac_conf 

865 

866 # Heuristic 6: Mac/Windows/ISO disambiguation for CENTRAL EUROPEAN encodings 

867 is_central_european_family = ( 

868 lower_charset_name in CONFUSED_CENTRAL_EUROPEAN_ENCODINGS 

869 ) 

870 

871 if is_central_european_family and lower_charset_name == "maclatin2": 

872 # MacLatin2 wins but no Mac patterns → prefer Windows/ISO 

873 if not self._has_mac_latin_letter_pattern: 

874 alt_names = ( 

875 ("windows-1250", "iso-8859-2") 

876 if self._has_win_bytes 

877 else ("iso-8859-2", "windows-1250") 

878 ) 

879 for alt_name in alt_names: 

880 if alt_name in alternatives: 

881 prober, alt_conf = alternatives[alt_name] 

882 if alt_conf >= confidence * 0.995: 

883 self._best_guess_prober = prober 

884 return alt_conf 

885 

886 elif is_central_european_family and lower_charset_name == "iso-8859-2": 

887 # ISO-8859-2 wins and has Windows bytes → switch to Windows-1250 

888 if self._has_win_bytes: 

889 should_switch = True 

890 if self._has_mac_latin_letter_pattern and "maclatin2" in alternatives: 

891 _, mac_conf = alternatives["maclatin2"] 

892 if mac_conf >= confidence * 0.995: 

893 should_switch = False 

894 

895 if should_switch and "windows-1250" in alternatives: 

896 prober, alt_conf = alternatives["windows-1250"] 

897 self._best_guess_prober = prober 

898 return alt_conf 

899 

900 # Heuristic 7: Prefer MacLatin2 when Mac Latin letter patterns present 

901 if self._has_mac_latin_letter_pattern and "maclatin2" in alternatives: 

902 prober, mac_conf = alternatives["maclatin2"] 

903 is_central_european_win_or_iso = ( 

904 lower_charset_name in CONFUSED_CENTRAL_EUROPEAN_ENCODINGS 

905 and lower_charset_name != "maclatin2" 

906 ) 

907 if is_central_european_win_or_iso and mac_conf >= confidence * 0.90: 

908 self._best_guess_prober = prober 

909 return mac_conf 

910 

911 return confidence