from __future__ import annotations

from functools import lru_cache
from logging import getLogger

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
    is_case_variable,
    is_cjk,
    is_cjk_uncommon,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.0; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover


class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
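    """
    Flag printable sequences overloaded with punctuation and symbols.
    A symbol weighs twice as much as a punctuation sign, and the ratio is only
    reported once it reaches 0.3 of the characters seen.
    """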
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: str | None = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
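    """
    Flag alphabetic sequences in which accentuated letters are unusually
    frequent (35% or more, once at least 8 letters have been seen).
    """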
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
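    """
    Flag sequences containing unprintable characters; each occurrence weighs
    heavily (factor 8) against the total character count.
    """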
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
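    """
    Detect suspicious successions of accentuated Latin letters, e.g. two
    consecutive upper-case accentuated letters or two consecutive accentuated
    letters sharing the same base letter.
    """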
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same character duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
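    """
    Detect improbable successions of printable characters drawn from unrelated
    Unicode ranges, as judged by is_suspiciously_successive_range below.
    """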
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: str | None = unicode_range(self._last_printable_seen)
        unicode_range_b: str | None = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count <= 13:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
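    """
    Detect words that are unlikely in a natural language: heavily accentuated
    words, very long "foreign" words, words ending with an upper-case
    accentuated letter, words holding a lone CJK/Hangul/Kana/Thai glyph, or
    words polluted by unexpected symbols.
    """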
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Words/buffers ending with an upper-case accentuated letter are so rare
                # that we consider them all suspicious. Same weight as a foreign_long suspicion.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Detect messy CJK text that probably means nothing.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def eligible(self, character: str) -> bool:
        return is_cjk(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_cjk_uncommon(character):
            self._uncommon_count += 1
            return

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._uncommon_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        uncommon_form_usage: float = self._uncommon_count / self._character_count
        # We can be pretty sure it's garbage when uncommon characters are widely
        # used. Otherwise it could just be Traditional Chinese, for example.
        return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0


class ArchaicUpperLowerPlugin(MessDetectorPlugin):
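    """
    Detect unnatural alternation between upper and lower case inside words,
    counted only for chunks of at most 64 characters that contain at least one
    non-ASCII character.
    """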
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


class ArabicIsolatedFormPlugin(MessDetectorPlugin):
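    """
    Measure the proportion of Arabic letters appearing in their isolated
    (presentation) form; the proportion is reported once at least 8 Arabic
    letters have been seen.
    """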
    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        isolated_form_usage: float = self._isolated_form_count / self._character_count

        return isolated_form_usage


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
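    # For instance, "Basic Latin" directly followed by "Cyrillic" is considered
    # suspicious, while "Basic Latin" followed by "Latin-1 Supplement" is not,
    # since both names share the "Latin" keyword.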
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied by a combining diacritical mark,
    # e.g. in Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = (
        unicode_range_a.split(" "),
        unicode_range_b.split(" "),
    )

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True


@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """
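    # Illustrative behaviour (not a strict contract): clean text such as
    # "Hello, how are you?" scores 0.0, while heavily garbled or mojibake input
    # tends to score at or above the default maximum_threshold of 0.2.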

    detectors: list[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

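    # +1 because a trailing "\n" is fed after the payload (see the zip below),
    # which lets word-buffering plugins flush their last pending word.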
    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

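    # Re-evaluate the cumulated ratio periodically: every 32 characters for
    # short payloads, every 64 or 128 characters for longer ones.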
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16:]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)