Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/md.py: 23%

301 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-08 06:40 +0000

1from functools import lru_cache 

2from logging import getLogger 

3from typing import List, Optional 

4 

5from .constant import ( 

6 COMMON_SAFE_ASCII_CHARACTERS, 

7 TRACE, 

8 UNICODE_SECONDARY_RANGE_KEYWORD, 

9) 

10from .utils import ( 

11 is_accentuated, 

12 is_arabic, 

13 is_arabic_isolated_form, 

14 is_case_variable, 

15 is_cjk, 

16 is_emoticon, 

17 is_hangul, 

18 is_hiragana, 

19 is_katakana, 

20 is_latin, 

21 is_punctuation, 

22 is_separator, 

23 is_symbol, 

24 is_thai, 

25 is_unprintable, 

26 remove_accent, 

27 unicode_range, 

28) 

29 

30 

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether this character is relevant to the plugin and
        should be passed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Consume one character and update the internal chaos statistics.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the plugin to its freshly-constructed state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from everything feed() has seen so far.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover

63 

64 

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag printable content that is overloaded with punctuation and/or symbols.
    Repeated identical characters and common safe ASCII are not counted.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Last printable character seen; identical repeats are ignored.
        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Non-emoticon symbols weigh double compared to punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # BUG FIX: the last seen character previously survived reset(), which
        # could make the first character of the next analysis be silently
        # skipped when it happened to match the stale value.
        self._last_printable_char = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        # Below 30% the content is considered unremarkable.
        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

110 

111 

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Detect text where an unusually large share of the letters carry accents,
    which commonly indicates a wrong decoding choice.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        # Only letters matter for accent statistics.
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        # Fewer than 8 letters: not enough signal to judge.
        if self._character_count < 8:
            return 0.0

        accent_share: float = self._accentuated_count / self._character_count
        return accent_share if accent_share >= 0.35 else 0.0

137 

138 

class UnprintablePlugin(MessDetectorPlugin):
    """
    Count unprintable characters; even a small proportion is a strong sign
    of a wrong decoding, hence the heavy weighting in ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        # Every character is inspected.
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # BUG FIX: the character counter was previously not cleared on
        # reset(), which artificially deflated the ratio of any subsequent
        # analysis performed with this instance.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        # Each unprintable character weighs 8x.
        return (self._unprintable_count * 8) / self._character_count

161 

162 

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Detect successive accentuated Latin letters, a pattern that rarely occurs
    in legitimate text but often in mojibake.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse when it is the same base letter under a different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count

198 

199 

class SuspiciousRange(MessDetectorPlugin):
    """
    Flag printable text that keeps jumping between Unicode ranges that do not
    plausibly appear next to each other.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the chain.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character
        if previous is None:
            return

        if is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        # Not enough characters to draw a conclusion.
        if self._character_count <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / self._character_count

247 

248 

class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Detect implausible words: heavily accentuated words, suspiciously long
    non-Latin/non-Asian runs, and letters mixed with unexpected symbols.
    Characters are buffered into words; statistics are committed when a
    separator (space/punctuation) ends the word.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Start watching as soon as a non-Latin (or accentuated) letter
            # appears that does not belong to a known CJK/Asian script.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # Word boundary reached: commit statistics for the buffered word.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # Over ~1/3 accentuated letters in one word is suspicious.
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                if buffer_length >= 24 and self._foreign_long_watch:
                    # Tolerate very long identifiers that look camelCased.
                    camel_case_dst = [
                        i
                        for c, i in zip(self._buffer, range(0, buffer_length))
                        if c.isupper()
                    ]
                    probable_camel_cased: bool = False

                    if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                        probable_camel_cased = True

                    if not probable_camel_cased:
                        self._foreign_long_count += 1
                        self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol embedded inside a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0
        # BUG FIX: the per-word accent counter previously survived reset(),
        # skewing the accent ratio of the first word of the next analysis.
        self._buffer_accent_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum sample unless a foreign-long word was spotted.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

352 

353 

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly when
    the content does not fit, and this is easy to spot: look for an overuse
    of the characters '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character == "丅" or character == "丄":
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Too little CJK content to judge.
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

383 

384 

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect abnormal upper/lower case alternation (e.g. "aBcDeF") inside
    chunks of cased letters, a pattern typical of decoding errors.
    Chunks are delimited by any character that is not a case-variable letter.
    """

    def __init__(self) -> None:
        # True when one alternation has been seen and we are waiting for a
        # second one before counting (pairs of flips are what get scored).
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Running alternation score for the current chunk.
        self._successive_upper_lower_count: int = 0
        # Committed score across all completed chunks.
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        # Tracks whether the current chunk is pure ASCII; pure-ASCII chunks
        # are never committed to the final score.
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        # A character "of concern" is a cased letter; anything else acts
        # as a chunk separator.
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Commit the chunk score only for short (<= 64 chars), non-digit
            # separators and chunks that contained non-ASCII characters.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a fresh chunk; the separator itself still counts as a
            # processed character.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            # Case flipped relative to the previous letter?
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    # Second flip in a row: score it (weight 2).
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count

459 

460 

class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Arabic text rendered mostly in isolated letter forms is a strong sign of
    a wrong decoding (properly encoded Arabic uses contextual forms).
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._isolated_form_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        # Fewer than 8 Arabic characters: not enough signal.
        if self._character_count < 8:
            return 0.0
        return self._isolated_form_count / self._character_count

487 

488 

@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    # An unknown range next to anything is always suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    latin_a = "Latin" in unicode_range_a
    latin_b = "Latin" in unicode_range_b
    if latin_a and latin_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_a or latin_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    # Ranges sharing a significant keyword (e.g. a script name) are fine.
    keywords_range_b = unicode_range_b.split(" ")
    for keyword in unicode_range_a.split(" "):
        if keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if keyword in keywords_range_b:
            return False

    # Japanese exception: kana mixes freely with CJK and with each other.
    range_a_jp_chars = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp_chars = unicode_range_b in ("Hiragana", "Katakana")
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True

560 

561 

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """
    # One fresh instance of every registered detector plugin.
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1
    mean_mess_ratio: float = 0.0

    # Re-evaluate the accumulated ratio every N characters; longer inputs
    # are sampled less frequently.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    # The appended newline guarantees word/chunk based detectors flush
    # their internal buffers at the end of the input.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            # Early exit: content already judged too messy.
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)