Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/md.py: 23%
301 statements
coverage.py v7.3.2, created at 2023-12-08 06:40 +0000
from functools import lru_cache
from logging import getLogger
from typing import List, Optional

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
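
# Illustrative sketch (hypothetical, not part of the module, and kept as a comment so
# that MessDetectorPlugin.__subclasses__() used by mess_ratio() is not altered): a
# detector only has to implement the four methods above, e.g.
#
#     class TooManyQuestionMarksPlugin(MessDetectorPlugin):
#         def __init__(self) -> None:
#             self._character_count: int = 0
#             self._question_mark_count: int = 0
#
#         def eligible(self, character: str) -> bool:
#             return character.isprintable()
#
#         def feed(self, character: str) -> None:
#             self._character_count += 1
#             if character == "?":
#                 self._question_mark_count += 1
#
#         def reset(self) -> None:
#             self._character_count = 0
#             self._question_mark_count = 0
#
#         @property
#         def ratio(self) -> float:
#             if self._character_count == 0:
#                 return 0.0
#             return self._question_mark_count / self._character_count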

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
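
# Note on the plugin above: each symbol counts twice as heavily as a punctuation sign,
# and the combined ratio is only reported once it reaches 0.3; below that the text is
# treated as clean (0.0).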

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same char duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count

class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
        unicode_range_b: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count <= 24:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return ratio_of_suspicious_range_usage

class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Words/buffers ending with an upper-case accentuated letter are so rare
                # that we consider them all suspicious. Same weight as a foreign_long suspicion.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
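
# Rough summary of the heuristic above: a word is flagged as "bad" when more than about
# a third of its letters are accentuated, when it ends with an upper-case accentuated
# letter while the word is not fully upper-cased, when it embeds unexpected symbols, or
# when it is a very long (>= 24 chars) token containing non-Latin or accentuated letters
# without a plausible camelCase capitalization pattern.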

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the content does
    not fit, and this can be easily detected by searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
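
# Descriptive note on the plugin above: it flags text where letter case flips back and
# forth inside a single word-like chunk of at most 64 characters, and it only counts such
# chunks when they contain at least one non-ASCII character, since that pattern is rare in
# legitimate text but common in mis-decoded content.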

class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        isolated_form_usage: float = self._isolated_form_count / self._character_count

        return isolated_form_usage

@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered as suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark,
    # e.g. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True
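
# A few illustrative pairs for the helper above (assuming the range names produced by
# utils.unicode_range): ("Basic Latin", "Cyrillic") is reported as suspicious, while
# ("Hiragana", "Katakana"), ("CJK Unified Ideographs", "Hiragana") and two Latin-derived
# ranges such as ("Basic Latin", "Latin-1 Supplement") are not.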

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
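
# Minimal usage sketch (illustrative; assumes the package is importable as
# charset_normalizer). The library itself calls mess_ratio() on candidate decodings and
# keeps the ones that score below the threshold.
#
#     from charset_normalizer.md import mess_ratio
#
#     payload = b"Hello, this is perfectly ordinary English text."
#     score = mess_ratio(payload.decode("utf_8"), maximum_threshold=0.2)
#     # A clean ASCII sentence like this should score at or near 0.0,
#     # while mojibake typically pushes the ratio above the threshold.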