Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/md.py: 22%

285 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-25 06:22 +0000

1from functools import lru_cache 

2from logging import getLogger 

3from typing import List, Optional 

4 

5from .constant import ( 

6 COMMON_SAFE_ASCII_CHARACTERS, 

7 TRACE, 

8 UNICODE_SECONDARY_RANGE_KEYWORD, 

9) 

10from .utils import ( 

11 is_accentuated, 

12 is_ascii, 

13 is_case_variable, 

14 is_cjk, 

15 is_emoticon, 

16 is_hangul, 

17 is_hiragana, 

18 is_katakana, 

19 is_latin, 

20 is_punctuation, 

21 is_separator, 

22 is_symbol, 

23 is_thai, 

24 is_unprintable, 

25 remove_accent, 

26 unicode_range, 

27) 

28 

29 

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.

    A plugin is fed printable-or-not characters one at a time (after an
    `eligible` check) and exposes a chaos `ratio` computed from what it saw.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover

62 

63 

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content that is disproportionately made of punctuation and symbols.

    Immediate repetitions of the same printable character and a small set of
    common safe ASCII characters are not counted. Symbols weigh twice as much
    as punctuation. The ratio only kicks in at or above 0.3.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous printable character seen; used to skip immediate repeats.
        # NOTE: the unused `_frenzy_symbol_in_word` attribute was removed.
        self._last_printable_char: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols are considered twice as suspicious as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Bug fix: also drop the memorized character so a fresh document does
        # not inherit repeat-suppression state from the previous run.
        self._last_printable_char = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

109 

110 

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag text where an improbable share (>= 35%) of the alphabetic
    characters carry an accent (diacritic).
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimal sample (8 alpha characters) before judging.
        # This single check also covers the empty (division-by-zero) case;
        # the former `== 0 or < 8` test was redundant.
        if self._character_count < 8:
            return 0.0
        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

135 

136 

class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag the presence of unprintable characters; each occurrence weighs
    heavily (factor 8) on the resulting ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        # Every character is inspected.
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # Bug fix: the character counter must also be cleared, otherwise a
        # reused plugin dilutes the ratio with the previous run's length
        # (every sibling plugin resets all of its counters).
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count

159 

160 

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Spot improbable successions of accentuated Latin letters: two accentuated
    upper-case letters in a row, or the same base letter repeated with an
    accent variation.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0
        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        previous = self._last_latin_character
        self._last_latin_character = character

        if previous is None:
            return
        if not (is_accentuated(character) and is_accentuated(previous)):
            return

        if character.isupper() and previous.isupper():
            self._successive_count += 1
        # Worse if it's the same char duplicated with different accent.
        if remove_accent(character) == remove_accent(previous):
            self._successive_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count

196 

197 

class SuspiciousRange(MessDetectorPlugin):
    """
    Count transitions between incompatible Unicode ranges inside runs of
    printable characters; whitespace, punctuation and common safe ASCII
    characters act as run separators.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        is_boundary = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if is_boundary:
            # Separators break the comparison chain.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character

        if previous is None:
            # First character of a new run; nothing to compare against yet.
            return

        range_a: Optional[str] = unicode_range(previous)
        range_b: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(range_a, range_b):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        # Below 10% the signal is considered noise.
        return usage if usage >= 0.1 else 0.0

248 

249 

class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate characters into words (separated by space/punctuation) and
    flag words that look implausible: heavily accentuated, ending with an
    upper-case accentuated letter, very long non-camel-cased foreign runs,
    or containing unexpected symbols.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Set while building a word; folded into the bad counters at word end.
        self._is_current_word_bad: bool = False
        # True when the current word contains a non-Latin/accentuated char
        # outside the known CJK/Hangul/Katakana/Hiragana/Thai scripts.
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the word currently being assembled.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            # Still inside a word: accumulate and track accent/foreign state.
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            # Non-alpha character with no word in progress: ignore.
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # Word boundary reached: evaluate the buffered word.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # More than ~a third of the letters accentuated is suspicious.
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                if buffer_length >= 24 and self._foreign_long_watch:
                    # Very long foreign-looking word: tolerate it only if it
                    # plausibly is a camelCased identifier (few upper-case
                    # letters relative to its length).
                    camel_case_dst = [
                        i
                        for c, i in zip(self._buffer, range(0, buffer_length))
                        if c.isupper()
                    ]
                    probable_camel_cased: bool = False

                    if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                        probable_camel_cased = True

                    if not probable_camel_cased:
                        self._foreign_long_count += 1
                        self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            # Start a fresh word.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # Unexpected symbol embedded in a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Too small a sample unless a long foreign word was already seen.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

349 

350 

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character == "丅" or character == "丄":
            # Mis-rendered stop marker; deliberately not counted as CJK.
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum amount of CJK content before judging.
        return (
            self._wrong_stop_count / self._cjk_character_count
            if self._cjk_character_count >= 16
            else 0.0
        )

380 

381 

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect improbable upper/lower case alternations within words
    (e.g. "aBcDeF"), a common artifact of wrongly decoded text.

    Characters are grouped into chunks delimited by non-case-variable
    characters; alternation counts are only committed for short chunks
    containing at least one non-ASCII character.
    """

    def __init__(self) -> None:
        # Pending flag: one case flip seen; a second consecutive flip is
        # what gets counted.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Alternations within the current chunk, and the committed total.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # End of a chunk: commit its alternation count only for short
            # (<= 64 chars) chunks that contained non-ASCII characters and
            # whose separator is not a digit.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Reset per-chunk state; the separator itself still counts
            # toward the global character total.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    # Second flip in a row: count it (weight 2).
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count

456 

457 

@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        # An unknown range next to anything is suspicious by default.
        return True

    if unicode_range_a == unicode_range_b:
        return False

    def in_either(fragment: str) -> bool:
        # True when at least one of the two range names mentions `fragment`.
        return fragment in unicode_range_a or fragment in unicode_range_b

    def in_both(fragment: str) -> bool:
        # True when both range names mention `fragment`.
        return fragment in unicode_range_a and fragment in unicode_range_b

    if in_both("Latin"):
        return False

    if in_either("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if in_either("Latin") and in_either("Combining"):
        return False

    # Two ranges sharing a significant (non-secondary) keyword, such as
    # "Arabic", are considered related and therefore not suspicious.
    words_of_b = unicode_range_b.split(" ")
    for word in unicode_range_a.split(" "):
        if word in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if word in words_of_b:
            return False

    # Japanese Exception
    range_a_jp_chars = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp_chars = unicode_range_b in ("Hiragana", "Katakana")

    if (range_a_jp_chars or range_b_jp_chars) and in_either("CJK"):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if in_either("Hangul"):
        if in_either("CJK"):
            return False
        if "Basic Latin" in (unicode_range_a, unicode_range_b):
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if in_either("CJK") or (range_a_jp_chars and range_b_jp_chars):
        if in_either("Punctuation"):
            return False
        if in_either("Forms"):
            return False

    return True

527 

528 

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.

    :param decoded_sequence: Text to score.
    :param maximum_threshold: Once the summed plugin ratios reach this value
        the scan stops early.
    :param debug: Emit TRACE-level logs with per-plugin ratios.
    :return: Sum of all plugin ratios, rounded to 3 decimals.
    """

    # Every concrete subclass of MessDetectorPlugin is instantiated; new
    # detectors participate automatically just by being defined.
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    # +1 accounts for the "\n" sentinel appended below so the final word /
    # chunk of the sequence gets flushed through the plugins.
    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Ratio-evaluation period scales with input size: longer inputs are
    # sampled less often to keep the scan cheap.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        # Periodically (and on the final character) sum the plugin ratios;
        # bail out early once the threshold is exceeded.
        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)