1from __future__ import annotations
2
3import sys
4from functools import lru_cache
5from logging import getLogger
6
# ``final`` is available from ``typing`` starting with Python 3.8. Older
# interpreters try ``typing_extensions`` and, when that package is missing,
# fall back to a decorator that does nothing.
if sys.version_info >= (3, 8):
    from typing import final
else:
    try:
        from typing_extensions import final
    except ImportError:

        def final(cls):  # type: ignore[misc,no-untyped-def]
            """Stand-in for ``typing.final``: return *cls* unchanged."""
            return cls
16
17
18from .constant import (
19 COMMON_CJK_CHARACTERS,
20 COMMON_SAFE_ASCII_CHARACTERS,
21 TRACE,
22 UNICODE_SECONDARY_RANGE_KEYWORD,
23 _ACCENTUATED,
24 _ARABIC,
25 _ARABIC_ISOLATED_FORM,
26 _CJK,
27 _HANGUL,
28 _HIRAGANA,
29 _KATAKANA,
30 _LATIN,
31 _THAI,
32)
33from .utils import (
34 _character_flags,
35 is_emoticon,
36 is_punctuation,
37 is_separator,
38 is_symbol,
39 remove_accent,
40 unicode_range,
41)
42
# Bitwise OR of the CJK, Hangul, Katakana, Hiragana and Thai script flags.
# CharInfo.update() tests a character's flags against this mask to set ``is_glyph``.
_GLYPH_MASK: int = _CJK | _HANGUL | _KATAKANA | _HIRAGANA | _THAI
45
46
@final
class CharInfo:
    """Per-character property cache shared by all mess detectors.

    A single instance is refreshed through :meth:`update` for every
    character of the hot loop, so the ``str`` predicates (``isalpha``,
    ``isupper``, ``isspace``, ...) and the cached helpers
    (``_character_flags``, ``is_punctuation``, ``is_symbol``) are evaluated
    at most once per character, however many plugins read the result.
    """

    __slots__ = (
        "character",
        "printable",
        "alpha",
        "upper",
        "lower",
        "space",
        "digit",
        "is_ascii",
        "case_variable",
        "flags",
        "accentuated",
        "latin",
        "is_cjk",
        "is_arabic",
        "is_glyph",
        "punct",
        "sym",
    )

    def __init__(self) -> None:
        # The character most recently passed to update().
        self.character: str = ""
        # Mirrors of the str predicates isprintable/isalpha/isupper/islower/
        # isspace/isdigit for that character.
        self.printable: bool = False
        self.alpha: bool = False
        self.upper: bool = False
        self.lower: bool = False
        self.space: bool = False
        self.digit: bool = False
        # True when ord(character) < 128.
        self.is_ascii: bool = False
        # True when exactly one of ``upper`` / ``lower`` holds.
        self.case_variable: bool = False
        # Bitmask from _character_flags(); _LATIN for ASCII letters, 0 for
        # every non-alphabetic character.
        self.flags: int = 0
        # Booleans derived from ``flags``.
        self.accentuated: bool = False
        self.latin: bool = False
        self.is_cjk: bool = False
        self.is_arabic: bool = False
        self.is_glyph: bool = False
        # is_punctuation() / is_symbol() results; always False when the
        # character is not printable.
        self.punct: bool = False
        self.sym: bool = False

    def update(self, character: str) -> None:
        """Recompute every property for *character* (called once per character).

        ASCII code points are classified from their ordinal alone, without
        calling ``_character_flags``. Everything else goes through the
        ``str`` predicates and, for alphabetic characters only,
        ``_character_flags``.

        The ASCII whitespace branch covers exactly the code points for which
        ``str.isspace`` is true: U+0020, U+0009-U+000D and the separators
        U+001C-U+001F. Both paths therefore agree on ``space``.

        :param character: a one-character string.
        """
        self.character = character

        code: int = ord(character)
        if code < 128:
            self.is_ascii = True
            self.accentuated = self.is_cjk = self.is_arabic = self.is_glyph = False

            if 97 <= code <= 122:
                # Lowercase ASCII letter a-z.
                self.alpha = self.lower = self.printable = True
                self.case_variable = self.latin = True
                self.upper = self.space = self.digit = False
                self.punct = self.sym = False
                self.flags = _LATIN
            elif 65 <= code <= 90:
                # Uppercase ASCII letter A-Z.
                self.alpha = self.upper = self.printable = True
                self.case_variable = self.latin = True
                self.lower = self.space = self.digit = False
                self.punct = self.sym = False
                self.flags = _LATIN
            elif 48 <= code <= 57:
                # ASCII digit 0-9.
                self.digit = self.printable = True
                self.alpha = self.upper = self.lower = self.space = False
                self.case_variable = self.latin = False
                self.punct = self.sym = False
                self.flags = 0
            elif code == 32 or 9 <= code <= 13 or 28 <= code <= 31:
                # Every ASCII code point str.isspace() accepts. 28-31 (FS, GS,
                # RS, US) are included so they are not mistaken for
                # unprintable garbage further down the pipeline.
                self.space = True
                # Only the plain space is printable among them.
                self.printable = code == 32
                self.alpha = self.upper = self.lower = self.digit = False
                self.case_variable = self.latin = False
                self.punct = self.sym = False
                self.flags = 0
            else:
                # Remaining ASCII: punctuation, symbols, control characters.
                printable: bool = character.isprintable()
                self.printable = printable
                self.alpha = self.upper = self.lower = False
                self.space = self.digit = False
                self.case_variable = self.latin = False
                self.flags = 0
                self.punct = is_punctuation(character) if printable else False
                self.sym = is_symbol(character) if printable else False
            return

        # Non-ASCII path: rely on the str predicates.
        self.is_ascii = False
        printable = character.isprintable()
        alpha: bool = character.isalpha()
        upper: bool = character.isupper()
        lower: bool = character.islower()

        self.printable = printable
        self.alpha = alpha
        self.upper = upper
        self.lower = lower
        self.space = character.isspace()
        self.digit = character.isdigit()
        self.case_variable = lower != upper

        # Script/accent flags are only looked up for alphabetic characters.
        flags: int = _character_flags(character) if alpha else 0
        self.flags = flags
        self.accentuated = bool(flags & _ACCENTUATED)
        self.latin = bool(flags & _LATIN)
        self.is_cjk = bool(flags & _CJK)
        self.is_arabic = bool(flags & _ARABIC)
        self.is_glyph = bool(flags & _GLYPH_MASK)

        # Computed eagerly so the plugins read plain attributes in the hot loop.
        self.punct = is_punctuation(character) if printable else False
        self.sym = is_symbol(character) if printable else False
204
205
class MessDetectorPlugin:
    """
    Abstract base of every mess detection plugin.

    Concrete detectors subclass this type and override :meth:`feed_info`,
    :meth:`reset` and :attr:`ratio`. The versions defined here only raise
    ``NotImplementedError``.
    """

    __slots__ = ()

    def feed_info(self, character: str, info: CharInfo) -> None:
        """
        Consume one character together with its pre-computed properties.

        This is where a detector implements the logic by which the text
        would be considered chaotic.

        :param character: the character being analysed.
        :param info: the shared ``CharInfo`` describing *character*.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError  # Defensive:

    def reset(self) -> None:  # Defensive:
        """
        Bring the plugin back to its initial state.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio computed from what ``feed_info()`` has seen so far.

        Must never be below 0.0; there is no upper bound.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError  # Defensive:
234
235
@final
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """Detector scoring text by its share of punctuation and symbols.

    A punctuation character adds 1 and a symbol adds 2 to the score.
    Immediate repeats of the previous character and members of
    ``COMMON_SAFE_ASCII_CHARACTERS`` are never scored.
    """

    __slots__ = (
        "_punctuation_count",
        "_symbol_count",
        "_character_count",
        "_last_printable_char",
        "_frenzy_symbol_in_word",
    )

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous character fed, used to ignore immediate repeats.
        self._last_printable_char: str | None = None
        # Not read anywhere in this class; kept because it is a declared slot.
        self._frenzy_symbol_in_word: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Account for *character* using its pre-computed properties.

        :param character: the character being analysed.
        :param info: the shared ``CharInfo`` describing *character*.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if info.punct:
                self._punctuation_count += 1
            elif not info.digit and info.sym and not is_emoticon(character):
                # Symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        """Restore the state a freshly constructed instance has."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Forget the previous character too; otherwise the first character
        # fed after a reset could be skipped as a repeat from the prior run.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Score divided by characters fed, or 0.0 when that is below 0.3."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
284
285
@final
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """Detector reporting the share of fed characters flagged as accentuated."""

    __slots__ = ("_character_count", "_accentuated_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as accentuated when *info* says so."""
        self._character_count += 1

        if not info.accentuated:
            return
        self._accentuated_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._character_count = self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        """Accentuated share, or 0.0 below 8 characters or below a 0.35 share."""
        seen: int = self._character_count
        if seen < 8:
            return 0.0

        share: float = self._accentuated_count / seen
        if share >= 0.35:
            return share
        return 0.0
312
313
@final
class UnprintablePlugin(MessDetectorPlugin):
    """Detector counting characters that are neither whitespace nor printable.

    U+001A (ASCII SUBSTITUTE) and U+FEFF (ZERO WIDTH NO-BREAK SPACE) are
    never counted as unprintable.
    """

    __slots__ = ("_unprintable_count", "_character_count")

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, flagging it when it is unprintable.

        :param character: the character being analysed.
        :param info: the shared ``CharInfo`` describing *character*.
        """
        if (
            not info.space
            and not info.printable
            and character != "\x1a"
            and character != "\ufeff"
        ):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Restore the state a freshly constructed instance has."""
        self._unprintable_count = 0
        # The denominator must be cleared as well, otherwise the ratio after
        # a reset is diluted by the characters of the previous run.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Unprintable count weighted by 8, divided by the characters fed."""
        if self._character_count == 0:  # Defensive:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
342
343
@final
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """Detector penalising two accentuated characters fed back to back."""

    __slots__ = (
        "_successive_count",
        "_character_count",
        "_last_latin_character",
        "_last_was_accentuated",
    )

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        # Previously fed character and whether it was accentuated.
        self._last_latin_character: str | None = None
        self._last_was_accentuated: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Score *character* against the one fed just before it.

        When both are accentuated, one point is added if both are uppercase
        and one more if they are equal once ``remove_accent`` is applied.
        """
        self._character_count += 1

        previous = self._last_latin_character
        both_accentuated: bool = (
            previous is not None and info.accentuated and self._last_was_accentuated
        )
        if both_accentuated:
            penalty: int = 0
            if info.upper and previous.isupper():  # type: ignore[union-attr]
                penalty += 1
            if remove_accent(character) == remove_accent(previous):  # type: ignore[arg-type]
                penalty += 1
            self._successive_count += penalty

        self._last_latin_character = character
        self._last_was_accentuated = info.accentuated

    def reset(self) -> None:  # Abstract
        """Zero the counters and forget the previous character."""
        self._successive_count = self._character_count = 0
        self._last_latin_character = None
        self._last_was_accentuated = False

    @property
    def ratio(self) -> float:
        """Accumulated penalty weighted by 2, divided by the characters fed."""
        total: int = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total
387
388
@final
class SuspiciousRange(MessDetectorPlugin):
    """Detector counting adjacent characters whose Unicode ranges look unrelated."""

    __slots__ = (
        "_suspicious_successive_range_count",
        "_character_count",
        "_last_printable_seen",
        "_last_printable_range",
    )

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        # Previous character of the current run and its cached range name.
        self._last_printable_seen: str | None = None
        self._last_printable_range: str | None = None

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare the range of *character* with that of its predecessor.

        Whitespace, punctuation and ``COMMON_SAFE_ASCII_CHARACTERS`` members
        break the run: they are counted but never compared, and the character
        that follows them starts a new run.
        """
        self._character_count += 1

        if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS:
            self._last_printable_seen = self._last_printable_range = None
            return

        current_range: str | None = unicode_range(character)

        # Only compare when there is a predecessor in the current run.
        if self._last_printable_seen is not None and is_suspiciously_successive_range(
            self._last_printable_range, current_range
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character
        self._last_printable_range = current_range

    def reset(self) -> None:  # Abstract
        """Zero the counters and forget the previous character."""
        self._character_count = self._suspicious_successive_range_count = 0
        self._last_printable_seen = self._last_printable_range = None

    @property
    def ratio(self) -> float:
        """Suspicious pairs weighted by 2 over characters fed; 0.0 up to 13 characters."""
        total: int = self._character_count
        if total <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total
443
444
@final
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """Detector judging whole words instead of single characters.

    Alphabetic characters are accumulated as running counters (length,
    accents, glyphs, uppercase) until a space, punctuation or separator ends
    the word. The word is then classified as bad or not.
    """

    __slots__ = (
        "_word_count",
        "_bad_word_count",
        "_foreign_long_count",
        "_is_current_word_bad",
        "_foreign_long_watch",
        "_character_count",
        "_bad_character_count",
        "_buffer_length",
        "_buffer_last_char",
        "_buffer_last_char_accentuated",
        "_buffer_accent_count",
        "_buffer_glyph_count",
        "_buffer_upper_count",
    )

    def __init__(self) -> None:
        # Totals over every finished word.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0
        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Verdict flags for the word currently being accumulated.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        # Counters describing the word currently being accumulated.
        self._buffer_length: int = 0
        self._buffer_last_char: str | None = None
        self._buffer_last_char_accentuated: bool = False
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0
        self._buffer_upper_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Extend the current word with *character* or close the word.

        :param character: the character being analysed.
        :param info: the shared ``CharInfo`` describing *character*.
        """
        # 1) Letters only extend the current word.
        if info.alpha:
            accentuated: bool = info.accentuated

            self._buffer_length += 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = accentuated

            if info.upper:
                self._buffer_upper_count += 1
            if accentuated:
                self._buffer_accent_count += 1

            if info.is_glyph:
                self._buffer_glyph_count += 1
            elif not self._foreign_long_watch and (accentuated or not info.latin):
                # A non-glyph letter that is accentuated or not Latin puts the
                # word under watch for the long-word rule applied below.
                self._foreign_long_watch = True
            return

        # 2) Anything else is ignored until a word has been started.
        length: int = self._buffer_length
        if not length:
            return

        # 3) Not a word boundary: a symbol inside a word taints it.
        if not (info.space or info.punct or is_separator(character)):
            if (
                not info.digit
                and info.sym
                and character not in {"<", ">", "-", "=", "~", "|", "_"}
            ):
                self._is_current_word_bad = True
                self._buffer_length = length + 1
                self._buffer_last_char = character
                self._buffer_last_char_accentuated = False
            return

        # 4) Word boundary: classify the finished word.
        self._word_count += 1
        self._character_count += length
        bad: bool = self._is_current_word_bad

        if length >= 4:
            if self._buffer_accent_count / length >= 0.5:
                # At least half of the word is accentuated.
                bad = True
            elif (
                self._buffer_last_char_accentuated
                and self._buffer_last_char.isupper()  # type: ignore[union-attr]
                and self._buffer_upper_count != length
            ) or self._buffer_glyph_count == 1:
                # Either the word ends with an uppercase accentuated letter
                # without being fully uppercase, or it holds exactly one glyph.
                self._foreign_long_count += 1
                bad = True

        if length >= 24 and self._foreign_long_watch:
            uppercase: int = self._buffer_upper_count
            # A long watched word is excused only when it has some uppercase
            # letters making up at most 30% of it (probably camel-cased).
            if uppercase == 0 or uppercase / length > 0.3:
                self._foreign_long_count += 1
                bad = True

        if bad:
            self._bad_word_count += 1
            self._bad_character_count += length

        # Start over for the next word.
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    def reset(self) -> None:  # Abstract
        """Zero every total and drop the word being accumulated."""
        self._word_count = self._bad_word_count = self._foreign_long_count = 0
        self._character_count = self._bad_character_count = 0

        self._is_current_word_bad = False
        self._foreign_long_watch = False

        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    @property
    def ratio(self) -> float:
        """Share of characters in bad words.

        Returns 0.0 while at most 10 words were seen and none of them
        incremented the foreign-long counter.
        """
        if self._foreign_long_count == 0 and self._word_count <= 10:
            return 0.0

        return self._bad_character_count / self._character_count
577
578
@final
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Spot messy CJK text that probably means nothing, by counting fed
    characters that are absent from ``COMMON_CJK_CHARACTERS``.
    """

    __slots__ = ("_character_count", "_uncommon_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as uncommon when it is not in the common set."""
        self._character_count += 1

        if character in COMMON_CJK_CHARACTERS:
            return
        self._uncommon_count += 1

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._character_count = self._uncommon_count = 0

    @property
    def ratio(self) -> float:
        """A tenth of the uncommon share once it exceeds 0.5, otherwise 0.0.

        Also 0.0 while fewer than 8 characters were fed.
        """
        total: int = self._character_count
        if total < 8:
            return 0.0

        uncommon_share: float = self._uncommon_count / total

        # Widely used uncommon characters are a strong sign of garbage; a
        # lower share could simply be traditional Chinese, for example.
        if uncommon_share > 0.5:
            return uncommon_share / 10
        return 0.0
612
613
@final
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """Detector scoring repeated upper/lower case alternation inside a chunk.

    A chunk ends on the first character that is not a case-variable letter.
    Its score is kept only when the chunk held a non-ASCII character, was at
    most 64 characters long and was not ended by a digit.
    """

    __slots__ = (
        "_buf",
        "_character_count_since_last_sep",
        "_successive_upper_lower_count",
        "_successive_upper_lower_count_final",
        "_character_count",
        "_last_alpha_seen",
        "_last_alpha_seen_upper",
        "_last_alpha_seen_lower",
        "_current_ascii_only",
    )

    def __init__(self) -> None:
        # True after one case flip, waiting for a second consecutive flip.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score of the running chunk and score accumulated over kept chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        # Previous character of the chunk and its cached case.
        self._last_alpha_seen: str | None = None
        self._last_alpha_seen_upper: bool = False
        self._last_alpha_seen_lower: bool = False
        self._current_ascii_only: bool = True

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Feed *character*, closing the running chunk when it is a separator.

        :param character: the character being analysed.
        :param info: the shared ``CharInfo`` describing *character*.
        """
        cased_letter: bool = info.alpha and info.case_variable

        if not cased_letter and self._character_count_since_last_sep > 0:
            # A separator closes a non-empty chunk.
            discard: bool = (
                self._current_ascii_only
                or info.digit
                or self._character_count_since_last_sep > 64
            )
            if not discard:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._last_alpha_seen = None
            self._buf = False
            self._current_ascii_only = True
            return

        if not info.is_ascii:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            case_flipped: bool = (info.upper and self._last_alpha_seen_lower) or (
                info.lower and self._last_alpha_seen_upper
            )
            if not case_flipped:
                self._buf = False
            elif self._buf:
                # Second flip in a row: score it and start waiting again.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = True

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character
        self._last_alpha_seen_upper = info.upper
        self._last_alpha_seen_lower = info.lower

    def reset(self) -> None:  # Abstract
        """Restore every field to its constructor value."""
        self._buf = False
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count = 0
        self._last_alpha_seen = None
        self._last_alpha_seen_upper = False
        self._last_alpha_seen_lower = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Accumulated score of kept chunks divided by the characters fed."""
        total: int = self._character_count
        if total == 0:  # Defensive:
            return 0.0

        return self._successive_upper_lower_count_final / total
705
706
@final
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Detector reporting the share of fed characters carrying the isolated-form flag."""

    __slots__ = ("_character_count", "_isolated_form_count")

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        """Zero both counters."""
        self._character_count = self._isolated_form_count = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count *character*, and count it as isolated when its flags say so."""
        self._character_count += 1

        if not (info.flags & _ARABIC_ISOLATED_FORM):
            return
        self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        """Isolated-form share of the characters fed; 0.0 below 8 characters."""
        total: int = self._character_count
        if total < 8:
            return 0.0

        return self._isolated_form_count / total
734
735
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Tell whether two Unicode ranges seen side by side look suspicious.

    The answer is True when either range is unknown (``None``) and False when
    one of the exceptions below applies; any other pairing is suspicious.
    Results are memoised with ``lru_cache``.

    :param unicode_range_a: range name of the first character, or ``None``.
    :param unicode_range_b: range name of the second character, or ``None``.
    :return: True when the pairing is considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    latin_in_a: bool = "Latin" in unicode_range_a
    latin_in_b: bool = "Latin" in unicode_range_b

    if latin_in_a and latin_in_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # A Latin letter may carry a combining diacritical mark (e.g. Vietnamese).
    if (latin_in_a or latin_in_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    # Two ranges sharing a word that is not a secondary keyword are related.
    words_of_b = unicode_range_b.split(" ")
    for word in unicode_range_a.split(" "):
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD and word in words_of_b:
            return False

    kana = ("Hiragana", "Katakana")
    a_is_kana: bool = unicode_range_a in kana
    b_is_kana: bool = unicode_range_b in kana
    cjk_involved: bool = "CJK" in unicode_range_a or "CJK" in unicode_range_b
    basic_latin_involved: bool = (
        unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin"
    )

    # Japanese exception: kana next to CJK, or kana next to kana.
    if (a_is_kana or b_is_kana) and cjk_involved:
        return False
    if a_is_kana and b_is_kana:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if cjk_involved or basic_latin_involved:
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if cjk_involved or (a_is_kana and b_is_kana):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if basic_latin_involved:
            return False

    return True
808
809
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of an already decoded sequence.

    Every character is fed to the detectors it is eligible for. The sum of
    the detector ratios is checked after each block of 32, 64 or 128
    characters (depending on the sequence length), and the scan stops as soon
    as that sum reaches *maximum_threshold*. Results are memoised with
    ``lru_cache``.

    :param decoded_sequence: the text to analyse.
    :param maximum_threshold: sum of ratios at which the scan is abandoned.
    :param debug: when true, log the outcome at ``TRACE`` level.
    :return: the sum of all detector ratios, rounded to 3 decimals.
    """

    seq_len: int = len(decoded_sequence)

    # Size of the blocks between two intermediate ratio checks.
    step: int = 32 if seq_len < 511 else (64 if seq_len < 1024 else 128)

    # One named detector each, so the hot loop needs neither a loop over a
    # detector list nor a per-character eligibility call.
    symbols = TooManySymbolOrPunctuationPlugin()
    accents = TooManyAccentuatedPlugin()
    unprintable = UnprintablePlugin()
    duplicate_accents = SuspiciousDuplicateAccentPlugin()
    ranges = SuspiciousRange()
    words = SuperWeirdWordPlugin()
    cjk = CjkUncommonPlugin()
    casing = ArchaicUpperLowerPlugin()
    arabic = ArabicIsolatedFormPlugin()

    def total_ratio() -> float:
        """Add the nine detector ratios, left to right."""
        return (
            symbols.ratio
            + accents.ratio
            + unprintable.ratio
            + duplicate_accents.ratio
            + ranges.ratio
            + words.ratio
            + cjk.ratio
            + casing.ratio
            + arabic.ratio
        )

    # Bound methods hoisted out of the hot loop.
    feed_symbols = symbols.feed_info
    feed_accents = accents.feed_info
    feed_unprintable = unprintable.feed_info
    feed_duplicate_accents = duplicate_accents.feed_info
    feed_ranges = ranges.feed_info
    feed_words = words.feed_info
    feed_cjk = cjk.feed_info
    feed_casing = casing.feed_info
    feed_arabic = arabic.feed_info

    # A single CharInfo is reused for every character (no per-character allocation).
    info: CharInfo = CharInfo()
    refresh = info.update

    mean_mess_ratio: float

    for offset in range(0, seq_len, step):
        for character in decoded_sequence[offset : offset + step]:
            # Compute the shared character properties once.
            refresh(character)

            # Fed with every character.
            feed_unprintable(character, info)
            feed_words(character, info)
            feed_casing(character, info)

            # Fed with printable characters only.
            if info.printable:
                feed_symbols(character, info)
                feed_ranges(character, info)

            # Fed with alphabetic characters only (Latin ones for the second).
            if info.alpha:
                feed_accents(character, info)
                if info.latin:
                    feed_duplicate_accents(character, info)

            # Fed with CJK characters only.
            if info.is_cjk:
                feed_cjk(character, info)

            # Fed with Arabic characters only.
            if info.is_arabic:
                feed_arabic(character, info)

        mean_mess_ratio = total_ratio()

        if mean_mess_ratio >= maximum_threshold:
            break
    else:
        # The scan was not abandoned: feed a trailing newline to the three
        # always-eligible detectors so the last word buffer gets flushed.
        refresh("\n")
        feed_words("\n", info)
        feed_casing("\n", info)
        feed_unprintable("\n", info)

        mean_mess_ratio = total_ratio()

    if debug:  # Defensive:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if seq_len > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in (
            symbols,
            accents,
            unprintable,
            duplicate_accents,
            ranges,
            words,
            cjk,
            casing,
            arabic,
        ):
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)