Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/md.py: 89%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

533 statements  

1from __future__ import annotations 

2 

3import sys 

4from functools import lru_cache 

5from logging import getLogger 

6 

7if sys.version_info >= (3, 8): 

8 from typing import final 

9else: 

10 try: 

11 from typing_extensions import final 

12 except ImportError: 

13 

14 def final(cls): # type: ignore[misc,no-untyped-def] 

15 return cls 

16 

17 

18from .constant import ( 

19 COMMON_CJK_CHARACTERS, 

20 COMMON_SAFE_ASCII_CHARACTERS, 

21 TRACE, 

22 UNICODE_SECONDARY_RANGE_KEYWORD, 

23 _ACCENTUATED, 

24 _ARABIC, 

25 _ARABIC_ISOLATED_FORM, 

26 _CJK, 

27 _HANGUL, 

28 _HIRAGANA, 

29 _KATAKANA, 

30 _LATIN, 

31 _THAI, 

32) 

33from .utils import ( 

34 _character_flags, 

35 is_emoticon, 

36 is_punctuation, 

37 is_separator, 

38 is_symbol, 

39 remove_accent, 

40 unicode_range, 

41) 

42 

# Bitmask matching any of the glyph-script flags (CJK, Hangul, Katakana,
# Hiragana, Thai); CharInfo.update() tests it to derive ``is_glyph``.
_GLYPH_MASK: int = _THAI | _HIRAGANA | _KATAKANA | _HANGUL | _CJK

45 

46 

@final
class CharInfo:
    """Pre-computed character properties shared across all detectors.

    A single instance is created by the caller and refreshed through
    :meth:`update` for every character of the hot loop, so that the
    detectors can read plain attributes instead of each repeating the
    same ``str`` method calls (``isalpha``, ``isupper``, ...) and helper
    lookups (``_character_flags``, ``is_punctuation``, ``is_symbol``).
    """

    __slots__ = (
        "character",
        "printable",
        "alpha",
        "upper",
        "lower",
        "space",
        "digit",
        "is_ascii",
        "case_variable",
        "flags",
        "accentuated",
        "latin",
        "is_cjk",
        "is_arabic",
        "is_glyph",
        "punct",
        "sym",
    )

    def __init__(self) -> None:
        # Every attribute starts in a neutral state; update() overwrites
        # all of them on each call, so no value leaks between characters.
        self.character: str = ""
        self.printable: bool = False
        self.alpha: bool = False
        self.upper: bool = False
        self.lower: bool = False
        self.space: bool = False
        self.digit: bool = False
        self.is_ascii: bool = False
        self.case_variable: bool = False
        self.flags: int = 0
        self.accentuated: bool = False
        self.latin: bool = False
        self.is_cjk: bool = False
        self.is_arabic: bool = False
        self.is_glyph: bool = False
        self.punct: bool = False
        self.sym: bool = False

    def update(self, character: str) -> None:
        """Recompute every property for *character* (called once per character).

        :param character: a single-character string; ``ord()`` raises
            ``TypeError`` for any other length.
        """
        self.character = character

        # ASCII fast-path: for code points below 128 the flag lookup is
        # skipped entirely and most properties are derived from ord().
        o: int = ord(character)
        if o < 128:
            self.is_ascii = True
            self.accentuated = False
            self.is_cjk = False
            self.is_arabic = False
            self.is_glyph = False
            if 65 <= o <= 90:
                # Uppercase ASCII letter (A-Z).
                self.alpha = True
                self.upper = True
                self.lower = False
                self.space = False
                self.digit = False
                self.printable = True
                self.case_variable = True
                self.flags = _LATIN
                self.latin = True
                self.punct = False
                self.sym = False
            elif 97 <= o <= 122:
                # Lowercase ASCII letter (a-z).
                self.alpha = True
                self.upper = False
                self.lower = True
                self.space = False
                self.digit = False
                self.printable = True
                self.case_variable = True
                self.flags = _LATIN
                self.latin = True
                self.punct = False
                self.sym = False
            elif 48 <= o <= 57:
                # ASCII digit (0-9).
                self.alpha = False
                self.upper = False
                self.lower = False
                self.space = False
                self.digit = True
                self.printable = True
                self.case_variable = False
                self.flags = 0
                self.latin = False
                self.punct = False
                self.sym = False
            elif o == 32 or 9 <= o <= 13 or 28 <= o <= 31:
                # ASCII whitespace exactly as str.isspace() defines it:
                # space, \t \n \v \f \r and the four information
                # separators U+001C..U+001F. The separators must be listed
                # here, otherwise this path would disagree with the
                # non-ASCII path (which calls str.isspace()) and they
                # would be reported as unprintable garbage.
                self.alpha = False
                self.upper = False
                self.lower = False
                self.space = True
                self.digit = False
                # Of all ASCII whitespace only the plain space is printable.
                self.printable = o == 32
                self.case_variable = False
                self.flags = 0
                self.latin = False
                self.punct = False
                self.sym = False
            else:
                # Remaining ASCII: punctuation, symbols and control chars.
                self.printable = character.isprintable()
                self.alpha = False
                self.upper = False
                self.lower = False
                self.space = False
                self.digit = False
                self.case_variable = False
                self.flags = 0
                self.latin = False
                # Unprintable characters are never punctuation nor symbol,
                # so the helpers are only consulted for printable ones.
                self.punct = is_punctuation(character) if self.printable else False
                self.sym = is_symbol(character) if self.printable else False
        else:
            # Non-ASCII path: rely on the str predicates.
            self.is_ascii = False
            self.printable = character.isprintable()
            self.alpha = character.isalpha()
            self.upper = character.isupper()
            self.lower = character.islower()
            self.space = character.isspace()
            self.digit = character.isdigit()
            self.case_variable = self.lower != self.upper

            # Flag-based classification; only alphabetic characters are
            # looked up, everything else carries no flag.
            flags: int
            if self.alpha:
                flags = _character_flags(character)
            else:
                flags = 0
            self.flags = flags
            self.accentuated = bool(flags & _ACCENTUATED)
            self.latin = bool(flags & _LATIN)
            self.is_cjk = bool(flags & _CJK)
            self.is_arabic = bool(flags & _ARABIC)
            self.is_glyph = bool(flags & _GLYPH_MASK)

            # Computed eagerly (rather than lazily through a property) to
            # avoid property dispatch overhead in the hot loop.
            self.punct = is_punctuation(character) if self.printable else False
            self.sym = is_symbol(character) if self.printable else False

204 

205 

class MessDetectorPlugin:
    """
    Abstract base of every mess detection plugin.
    Concrete detectors MUST subclass it and override the members below.
    """

    __slots__ = ()

    def feed_info(self, character: str, info: CharInfo) -> None:
        """
        Main routine, executed for each character handed to the plugin.
        Implement here the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError()  # Defensive:

    def reset(self) -> None:  # Defensive:
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError()

    @property
    def ratio(self) -> float:
        """
        Chaos ratio computed from what feed_info() has seen so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError()  # Defensive:

234 

235 

@final
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """Detect content where punctuation and symbols take too large a share.

    A character is only weighed when it differs from the one fed just
    before and is not part of ``COMMON_SAFE_ASCII_CHARACTERS``.
    Punctuation weighs 1; a non-digit, non-emoticon symbol weighs 2.
    """

    __slots__ = (
        "_punctuation_count",
        "_symbol_count",
        "_character_count",
        "_last_printable_char",
        "_frenzy_symbol_in_word",
    )

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous character fed; used to ignore immediate repetitions.
        self._last_printable_char: str | None = None
        # Not read anywhere in this class; kept so the slot layout and
        # attribute set stay backward-compatible.
        self._frenzy_symbol_in_word: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Account for *character* using its pre-computed properties."""
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if info.punct:
                self._punctuation_count += 1
            elif not info.digit and info.sym and not is_emoticon(character):
                # Symbols are weighted twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        """Restore the exact state produced by ``__init__``."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Also forget the previous character: otherwise the first character
        # fed after a reset would be skipped whenever it happens to equal
        # the last character of the previous run.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Weighted punctuation/symbol share, or 0.0 when it is below 0.3."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

284 

285 

@final
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """Measure the share of accentuated characters among those fed."""

    __slots__ = ("_character_count", "_accentuated_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as accentuated when flagged so."""
        self._character_count += 1
        self._accentuated_count += 1 if info.accentuated else 0

    def reset(self) -> None:  # Abstract
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Accentuated share; 0.0 below 8 characters or below a 0.35 share."""
        seen: int = self._character_count
        if seen < 8:
            return 0.0

        share: float = self._accentuated_count / seen
        if share >= 0.35:
            return share
        return 0.0

312 

313 

@final
class UnprintablePlugin(MessDetectorPlugin):
    """Detect characters that are neither whitespace nor printable.

    ``"\\x1a"`` and ``"\\ufeff"`` are explicitly tolerated and never counted.
    """

    __slots__ = ("_unprintable_count", "_character_count")

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Account for *character* using its pre-computed properties."""
        if (
            not info.space
            and not info.printable
            and character != "\x1a"
            and character != "\ufeff"
        ):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Restore the exact state produced by ``__init__``."""
        self._unprintable_count = 0
        # The total must be cleared as well, otherwise ratios computed after
        # a reset are diluted by the characters seen during the previous run.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Unprintable share, each occurrence being weighted 8 times."""
        if self._character_count == 0:  # Defensive:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count

342 

343 

@final
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """Spot pairs of consecutive accentuated characters that look suspicious."""

    __slots__ = (
        "_successive_count",
        "_character_count",
        "_last_latin_character",
        "_last_was_accentuated",
    )

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None
        self._last_was_accentuated: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare *character* with the previously fed one."""
        self._character_count += 1

        previous: str | None = self._last_latin_character
        accentuated: bool = info.accentuated

        if previous is not None and accentuated and self._last_was_accentuated:
            # Two accentuated characters in a row: one hit when both are
            # uppercase, and one more when they share the same base letter.
            if info.upper and previous.isupper():
                self._successive_count += 1
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character
        self._last_was_accentuated = accentuated

    def reset(self) -> None:  # Abstract
        self._last_was_accentuated = False
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """Hits weighted twice, relative to the number of characters fed."""
        total: int = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total

387 

388 

@final
class SuspiciousRange(MessDetectorPlugin):
    """Count adjacent characters whose Unicode ranges look suspicious together."""

    __slots__ = (
        "_suspicious_successive_range_count",
        "_character_count",
        "_last_printable_seen",
        "_last_printable_range",
    )

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None
        self._last_printable_range: str | None = None

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare the range of *character* with the one of its predecessor."""
        self._character_count += 1

        if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS:
            # Whitespace, punctuation and safe ASCII break the chain: the
            # next character starts over without any predecessor.
            self._last_printable_seen = None
            self._last_printable_range = None
            return

        current_range: str | None = unicode_range(character)

        # Only compare when a predecessor exists; the first character of a
        # chain is merely remembered.
        if self._last_printable_seen is not None:
            if is_suspiciously_successive_range(
                self._last_printable_range, current_range
            ):
                self._suspicious_successive_range_count += 1

        self._last_printable_seen = character
        self._last_printable_range = current_range

    def reset(self) -> None:  # Abstract
        self._last_printable_range = None
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Suspicious pairs weighted twice over the characters fed.

        Always 0.0 while 13 characters or fewer have been fed.
        """
        total: int = self._character_count
        if total <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total

443 

444 

@final
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """Judge words one at a time and track how many characters sit in bad ones.

    Alphabetic characters accumulate into a virtual buffer (only counters
    are kept, not the text). The buffer is evaluated and cleared when a
    space, punctuation or separator is fed.
    """

    __slots__ = (
        "_word_count",
        "_bad_word_count",
        "_foreign_long_count",
        "_is_current_word_bad",
        "_foreign_long_watch",
        "_character_count",
        "_bad_character_count",
        "_buffer_length",
        "_buffer_last_char",
        "_buffer_last_char_accentuated",
        "_buffer_accent_count",
        "_buffer_glyph_count",
        "_buffer_upper_count",
    )

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer_length: int = 0
        self._buffer_last_char: str | None = None
        self._buffer_last_char_accentuated: bool = False
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0
        self._buffer_upper_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Grow the current word with *character*, or close and judge it."""
        if info.alpha:
            # Letter: extend the current word and update its statistics.
            accentuated: bool = info.accentuated
            glyph: bool = info.is_glyph

            self._buffer_length += 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = accentuated

            if info.upper:
                self._buffer_upper_count += 1
            if accentuated:
                self._buffer_accent_count += 1
            if (
                not self._foreign_long_watch
                and not glyph
                and (accentuated or not info.latin)
            ):
                self._foreign_long_watch = True
            if glyph:
                self._buffer_glyph_count += 1
            return

        # Nothing buffered: a non-letter cannot close or taint any word.
        if not self._buffer_length:
            return

        if not (info.space or info.punct or is_separator(character)):
            # Not a word boundary. A symbol glued inside a word taints it
            # (a few common ASCII ones and digits are tolerated).
            if (
                info.sym
                and not info.digit
                and character not in {"<", ">", "-", "=", "~", "|", "_"}
            ):
                self._is_current_word_bad = True
                self._buffer_length += 1
                self._buffer_last_char = character
                self._buffer_last_char_accentuated = False
            return

        # Word boundary reached: judge the buffered word.
        length: int = self._buffer_length
        uppers: int = self._buffer_upper_count
        bad: bool = self._is_current_word_bad

        self._word_count += 1
        self._character_count += length

        if length >= 4:
            if self._buffer_accent_count / length >= 0.5:
                bad = True
            elif (
                self._buffer_last_char_accentuated
                and self._buffer_last_char.isupper()  # type: ignore[union-attr]
                and uppers != length
            ):
                # Ends with an uppercase accentuated letter while the word
                # is not entirely uppercase.
                self._foreign_long_count += 1
                bad = True
            elif self._buffer_glyph_count == 1:
                bad = True
                self._foreign_long_count += 1

        if length >= 24 and self._foreign_long_watch:
            probable_camel_cased: bool = uppers > 0 and uppers / length <= 0.3

            if not probable_camel_cased:
                self._foreign_long_count += 1
                bad = True

        if bad:
            self._bad_word_count += 1
            self._bad_character_count += length

        # Start over with an empty word.
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    def reset(self) -> None:  # Abstract
        # Global counters.
        self._word_count = 0
        self._bad_word_count = 0
        self._foreign_long_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        # Current word state.
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    @property
    def ratio(self) -> float:
        """Share of characters that belong to words judged bad.

        Stays at 0.0 while at most 10 words were closed and no foreign-long
        hit was recorded.
        """
        if self._foreign_long_count == 0 and self._word_count <= 10:
            return 0.0

        return self._bad_character_count / self._character_count

577 

578 

@final
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Spot messy CJK content, i.e. text that most likely carries no meaning.
    """

    __slots__ = ("_character_count", "_uncommon_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*; flag it when absent from COMMON_CJK_CHARACTERS."""
        self._character_count += 1
        self._uncommon_count += 0 if character in COMMON_CJK_CHARACTERS else 1

    def reset(self) -> None:  # Abstract
        self._uncommon_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """A tenth of the uncommon share once it exceeds 0.5, otherwise 0.0."""
        total: int = self._character_count
        if total < 8:
            return 0.0

        uncommon_form_usage: float = self._uncommon_count / total

        # Garbage is very likely when uncommon characters dominate; below
        # that point it could simply be traditional chinese, for example.
        if uncommon_form_usage > 0.5:
            return uncommon_form_usage / 10
        return 0.0

612 

613 

@final
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """Track repeated upper/lower case alternation inside chunks of letters."""

    __slots__ = (
        "_buf",
        "_character_count_since_last_sep",
        "_successive_upper_lower_count",
        "_successive_upper_lower_count_final",
        "_character_count",
        "_last_alpha_seen",
        "_last_alpha_seen_upper",
        "_last_alpha_seen_lower",
        "_current_ascii_only",
    )

    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._last_alpha_seen_upper: bool = False
        self._last_alpha_seen_lower: bool = False
        self._current_ascii_only: bool = True

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Update the alternation state with *character*."""
        cased_letter: bool = info.alpha and info.case_variable

        if not cased_letter and self._character_count_since_last_sep > 0:
            # A chunk just ended. Its alternations are only retained when
            # the chunk is short enough, is not closed by a digit and held
            # at least one non-ASCII character.
            if (
                not self._current_ascii_only
                and not info.digit
                and self._character_count_since_last_sep <= 64
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._last_alpha_seen = None
            self._current_ascii_only = True
            self._buf = False
            return

        if not info.is_ascii:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            case_flipped: bool = bool(
                (info.upper and self._last_alpha_seen_lower)
                or (info.lower and self._last_alpha_seen_upper)
            )
            if case_flipped and self._buf:
                # Second flip in a row: record it and re-arm.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                # A first flip arms the buffer, anything else disarms it.
                self._buf = case_flipped

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character
        self._last_alpha_seen_upper = info.upper
        self._last_alpha_seen_lower = info.lower

    def reset(self) -> None:  # Abstract
        self._buf = False
        self._current_ascii_only = True
        self._last_alpha_seen = None
        self._last_alpha_seen_upper = False
        self._last_alpha_seen_lower = False
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count_since_last_sep = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Retained alternation count relative to the characters fed."""
        total: int = self._character_count
        if total == 0:  # Defensive:
            return 0.0

        return self._successive_upper_lower_count_final / total

705 

706 

@final
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Measure how many fed characters carry the Arabic isolated-form flag."""

    __slots__ = ("_character_count", "_isolated_form_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._isolated_form_count = 0
        self._character_count = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as isolated when flagged so."""
        self._character_count += 1
        self._isolated_form_count += 1 if info.flags & _ARABIC_ISOLATED_FORM else 0

    @property
    def ratio(self) -> float:
        """Isolated-form share; 0.0 while fewer than 8 characters were fed."""
        total: int = self._character_count
        if total < 8:
            return 0.0

        return self._isolated_form_count / total

734 

735 

736@lru_cache(maxsize=1024) 

737def is_suspiciously_successive_range( 

738 unicode_range_a: str | None, unicode_range_b: str | None 

739) -> bool: 

740 """ 

741 Determine if two Unicode range seen next to each other can be considered as suspicious. 

742 """ 

743 if unicode_range_a is None or unicode_range_b is None: 

744 return True 

745 

746 if unicode_range_a == unicode_range_b: 

747 return False 

748 

749 if "Latin" in unicode_range_a and "Latin" in unicode_range_b: 

750 return False 

751 

752 if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: 

753 return False 

754 

755 # Latin characters can be accompanied with a combining diacritical mark 

756 # eg. Vietnamese. 

757 if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and ( 

758 "Combining" in unicode_range_a or "Combining" in unicode_range_b 

759 ): 

760 return False 

761 

762 keywords_range_a, keywords_range_b = ( 

763 unicode_range_a.split(" "), 

764 unicode_range_b.split(" "), 

765 ) 

766 

767 for el in keywords_range_a: 

768 if el in UNICODE_SECONDARY_RANGE_KEYWORD: 

769 continue 

770 if el in keywords_range_b: 

771 return False 

772 

773 # Japanese Exception 

774 range_a_jp_chars, range_b_jp_chars = ( 

775 unicode_range_a 

776 in ( 

777 "Hiragana", 

778 "Katakana", 

779 ), 

780 unicode_range_b in ("Hiragana", "Katakana"), 

781 ) 

782 if (range_a_jp_chars or range_b_jp_chars) and ( 

783 "CJK" in unicode_range_a or "CJK" in unicode_range_b 

784 ): 

785 return False 

786 if range_a_jp_chars and range_b_jp_chars: 

787 return False 

788 

789 if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: 

790 if "CJK" in unicode_range_a or "CJK" in unicode_range_b: 

791 return False 

792 if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": 

793 return False 

794 

795 # Chinese/Japanese use dedicated range for punctuation and/or separators. 

796 if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or ( 

797 unicode_range_a in ["Katakana", "Hiragana"] 

798 and unicode_range_b in ["Katakana", "Hiragana"] 

799 ): 

800 if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b: 

801 return False 

802 if "Forms" in unicode_range_a or "Forms" in unicode_range_b: 

803 return False 

804 if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": 

805 return False 

806 

807 return True 

808 

809 

810@lru_cache(maxsize=2048) 

811def mess_ratio( 

812 decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False 

813) -> float: 

814 """ 

815 Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. 

816 """ 

817 

818 seq_len: int = len(decoded_sequence) 

819 

820 if seq_len < 511: 

821 step: int = 32 

822 elif seq_len < 1024: 

823 step = 64 

824 else: 

825 step = 128 

826 

827 # Create each detector as a named local variable (unrolled from the generic loop). 

828 # This eliminates per-character iteration over the detector list and 

829 # per-character eligible() virtual dispatch, while keeping every plugin class 

830 # intact and fully readable. 

831 d_sp: TooManySymbolOrPunctuationPlugin = TooManySymbolOrPunctuationPlugin() 

832 d_ta: TooManyAccentuatedPlugin = TooManyAccentuatedPlugin() 

833 d_up: UnprintablePlugin = UnprintablePlugin() 

834 d_sda: SuspiciousDuplicateAccentPlugin = SuspiciousDuplicateAccentPlugin() 

835 d_sr: SuspiciousRange = SuspiciousRange() 

836 d_sw: SuperWeirdWordPlugin = SuperWeirdWordPlugin() 

837 d_cu: CjkUncommonPlugin = CjkUncommonPlugin() 

838 d_au: ArchaicUpperLowerPlugin = ArchaicUpperLowerPlugin() 

839 d_ai: ArabicIsolatedFormPlugin = ArabicIsolatedFormPlugin() 

840 

841 # Local references for feed_info methods called in the hot loop. 

842 d_sp_feed = d_sp.feed_info 

843 d_ta_feed = d_ta.feed_info 

844 d_up_feed = d_up.feed_info 

845 d_sda_feed = d_sda.feed_info 

846 d_sr_feed = d_sr.feed_info 

847 d_sw_feed = d_sw.feed_info 

848 d_cu_feed = d_cu.feed_info 

849 d_au_feed = d_au.feed_info 

850 d_ai_feed = d_ai.feed_info 

851 

852 # Single reusable CharInfo object (avoids per-character allocation). 

853 info: CharInfo = CharInfo() 

854 info_update = info.update 

855 

856 mean_mess_ratio: float 

857 

858 for block_start in range(0, seq_len, step): 

859 for character in decoded_sequence[block_start : block_start + step]: 

860 # Pre-compute all character properties once (shared across all plugins). 

861 info_update(character) 

862 

863 # Detectors with eligible() == always True 

864 d_up_feed(character, info) 

865 d_sw_feed(character, info) 

866 d_au_feed(character, info) 

867 

868 # Detectors with eligible() == isprintable 

869 if info.printable: 

870 d_sp_feed(character, info) 

871 d_sr_feed(character, info) 

872 

873 # Detectors with eligible() == isalpha 

874 if info.alpha: 

875 d_ta_feed(character, info) 

876 # SuspiciousDuplicateAccent: isalpha() and is_latin() 

877 if info.latin: 

878 d_sda_feed(character, info) 

879 # CjkUncommon: is_cjk() 

880 if info.is_cjk: 

881 d_cu_feed(character, info) 

882 # ArabicIsolatedForm: is_arabic() 

883 if info.is_arabic: 

884 d_ai_feed(character, info) 

885 

886 mean_mess_ratio = ( 

887 d_sp.ratio 

888 + d_ta.ratio 

889 + d_up.ratio 

890 + d_sda.ratio 

891 + d_sr.ratio 

892 + d_sw.ratio 

893 + d_cu.ratio 

894 + d_au.ratio 

895 + d_ai.ratio 

896 ) 

897 

898 if mean_mess_ratio >= maximum_threshold: 

899 break 

900 else: 

901 # Flush last word buffer in SuperWeirdWordPlugin via trailing newline. 

902 info_update("\n") 

903 d_sw_feed("\n", info) 

904 d_au_feed("\n", info) 

905 d_up_feed("\n", info) 

906 

907 mean_mess_ratio = ( 

908 d_sp.ratio 

909 + d_ta.ratio 

910 + d_up.ratio 

911 + d_sda.ratio 

912 + d_sr.ratio 

913 + d_sw.ratio 

914 + d_cu.ratio 

915 + d_au.ratio 

916 + d_ai.ratio 

917 ) 

918 

919 if debug: # Defensive: 

920 logger = getLogger("charset_normalizer") 

921 

922 logger.log( 

923 TRACE, 

924 "Mess-detector extended-analysis start. " 

925 f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} " 

926 f"maximum_threshold={maximum_threshold}", 

927 ) 

928 

929 if seq_len > 16: 

930 logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}") 

931 logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") 

932 

933 for dt in [d_sp, d_ta, d_up, d_sda, d_sr, d_sw, d_cu, d_au, d_ai]: 

934 logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") 

935 

936 return round(mean_mess_ratio, 3)