Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/md.py: 88%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

350 statements  

1from __future__ import annotations 

2 

3from functools import lru_cache 

4from logging import getLogger 

5 

6from .constant import ( 

7 COMMON_SAFE_ASCII_CHARACTERS, 

8 TRACE, 

9 UNICODE_SECONDARY_RANGE_KEYWORD, 

10) 

11from .utils import ( 

12 is_accentuated, 

13 is_arabic, 

14 is_arabic_isolated_form, 

15 is_case_variable, 

16 is_cjk, 

17 is_emoticon, 

18 is_hangul, 

19 is_hiragana, 

20 is_katakana, 

21 is_latin, 

22 is_punctuation, 

23 is_separator, 

24 is_symbol, 

25 is_thai, 

26 is_unprintable, 

27 remove_accent, 

28 unicode_range, 

29 is_cjk_uncommon, 

30) 

31 

32 

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        Return True when feed() should receive this character.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover

65 

66 

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content that carries an excessive proportion of punctuation/symbol
    characters relative to all printable characters seen.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Last printable character fed; repeats of it are not re-counted.
        self._last_printable_char: str | None = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Immediate repeats (e.g. "!!!") count once; common safe ASCII
        # characters are never suspicious.
        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Non-emoticon symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Fix: also restore the per-run tracking state, otherwise a reused
        # instance keeps the previous run's last character and may skip
        # counting the first character of the next run.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        # Below 30% the content is considered acceptable.
        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

112 

113 

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag text where accentuated letters dominate the alphabetic content.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        # Fewer than 8 letters seen: not enough signal to judge.
        if self._character_count < 8:
            return 0.0

        accent_share: float = self._accentuated_count / self._character_count
        # Under 35% accentuation is considered normal for accented languages.
        return accent_share if accent_share >= 0.35 else 0.0

139 

140 

class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag content containing unprintable characters; each occurrence is
    heavily weighted in the resulting ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        # Every character is inspected.
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._unprintable_count = 0
        # Fix: the total character counter must be cleared too, otherwise a
        # reused instance divides by an inflated denominator and
        # under-reports the ratio of the next run.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        # One unprintable character weighs as much as 8 regular ones.
        return (self._unprintable_count * 8) / self._character_count

163 

164 

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Flag successive accentuated Latin letters, a pattern that is very rare in
    natural text and typical of decoding mistakes.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0
        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            # Two accentuated capitals in a row are suspicious.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        return (self._successive_count * 2) / self._character_count

200 

201 

class SuspiciousRange(MessDetectorPlugin):
    """
    Flag improbable transitions between Unicode ranges within a run of
    printable characters.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the chain:
        # the next character starts a fresh comparison.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character

        if previous is None:
            return

        if is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        # Too few characters to draw a conclusion.
        if self._character_count <= 13:
            return 0.0

        return (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

249 

250 

class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate words (runs of alphabetic characters) and flag improbable
    ones: mostly-accentuated words, very long words of non-Latin-looking
    letters, or words with symbols mixed into the letters.
    """

    def __init__(self) -> None:
        # Totals across the whole feed.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # State for the word currently being accumulated.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Current word buffer and its per-word counters.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Arm the "foreign long word" watch once a letter is neither plain
            # Latin nor part of the CJK/Hangul/Kana/Thai scripts.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        # A separator flushes the buffered word and scores it.
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # Half or more of the word accentuated: suspicious.
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                # Exactly one CJK/Hangul/Kana/Thai glyph inside an otherwise
                # non-glyph word is suspicious as well.
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                # Very long watched words are tolerated only when they look
                # camelCased: a minority of upper-case letters spread inside.
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            # Clear per-word state for the next word.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        # Symbols other than a small allow-list taint the current word and are
        # kept in the buffer.
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        # NOTE(review): _buffer_accent_count and _buffer_glyph_count are not
        # cleared here; they are only reset when a word is flushed in feed().
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum number of words unless a foreign-long word was seen.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

367 

368 

class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Detect messy CJK text that probably means nothing.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def eligible(self, character: str) -> bool:
        return is_cjk(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_cjk_uncommon(character):
            self._uncommon_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._uncommon_count = 0

    @property
    def ratio(self) -> float:
        # Fewer than 8 CJK characters: not enough signal.
        if self._character_count < 8:
            return 0.0

        uncommon_share: float = self._uncommon_count / self._character_count

        # we can be pretty sure it's garbage when uncommon characters are widely
        # used. otherwise it could just be traditional chinese for example.
        return uncommon_share / 10 if uncommon_share > 0.5 else 0.0

402 

403 

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect odd aLtErNaTiNg upper/lower case usage inside chunks, scored only
    for chunks that contained at least one non-ASCII character.
    """

    def __init__(self) -> None:
        # Armed after one case flip; a second consecutive flip scores points.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score of the current chunk, and the committed total.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        # True while the current chunk contains only ASCII characters.
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        # Anything that is not a case-variable letter acts as a chunk separator.
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Commit the chunk's score only when the chunk is short enough,
            # the separator is not a digit, and a non-ASCII char was present.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a fresh chunk.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                # A single flip only arms the buffer; two consecutive flips
                # score 2.
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count

478 

479 

class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Flag Arabic text dominated by isolated-form characters, a hint of a
    decoding mishap.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        # Fewer than 8 Arabic characters: not enough signal.
        if self._character_count < 8:
            return 0.0

        return self._isolated_form_count / self._character_count

506 

507 

508@lru_cache(maxsize=1024) 

509def is_suspiciously_successive_range( 

510 unicode_range_a: str | None, unicode_range_b: str | None 

511) -> bool: 

512 """ 

513 Determine if two Unicode range seen next to each other can be considered as suspicious. 

514 """ 

515 if unicode_range_a is None or unicode_range_b is None: 

516 return True 

517 

518 if unicode_range_a == unicode_range_b: 

519 return False 

520 

521 if "Latin" in unicode_range_a and "Latin" in unicode_range_b: 

522 return False 

523 

524 if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: 

525 return False 

526 

527 # Latin characters can be accompanied with a combining diacritical mark 

528 # eg. Vietnamese. 

529 if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and ( 

530 "Combining" in unicode_range_a or "Combining" in unicode_range_b 

531 ): 

532 return False 

533 

534 keywords_range_a, keywords_range_b = ( 

535 unicode_range_a.split(" "), 

536 unicode_range_b.split(" "), 

537 ) 

538 

539 for el in keywords_range_a: 

540 if el in UNICODE_SECONDARY_RANGE_KEYWORD: 

541 continue 

542 if el in keywords_range_b: 

543 return False 

544 

545 # Japanese Exception 

546 range_a_jp_chars, range_b_jp_chars = ( 

547 unicode_range_a 

548 in ( 

549 "Hiragana", 

550 "Katakana", 

551 ), 

552 unicode_range_b in ("Hiragana", "Katakana"), 

553 ) 

554 if (range_a_jp_chars or range_b_jp_chars) and ( 

555 "CJK" in unicode_range_a or "CJK" in unicode_range_b 

556 ): 

557 return False 

558 if range_a_jp_chars and range_b_jp_chars: 

559 return False 

560 

561 if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: 

562 if "CJK" in unicode_range_a or "CJK" in unicode_range_b: 

563 return False 

564 if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": 

565 return False 

566 

567 # Chinese/Japanese use dedicated range for punctuation and/or separators. 

568 if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or ( 

569 unicode_range_a in ["Katakana", "Hiragana"] 

570 and unicode_range_b in ["Katakana", "Hiragana"] 

571 ): 

572 if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b: 

573 return False 

574 if "Forms" in unicode_range_a or "Forms" in unicode_range_b: 

575 return False 

576 if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": 

577 return False 

578 

579 return True 

580 

581 

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """

    # Instantiate every registered mess-detection plugin.
    detectors: list[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Sampling period for the intermediary ratio: larger payloads are
    # sampled less frequently.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    # A trailing newline is appended so trailing buffered state is flushed.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            # Early exit: already too messy to be a plausible decoding.
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)