Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/md.py: 22%

280 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:56 +0000

1from functools import lru_cache 

2from logging import getLogger 

3from typing import List, Optional 

4 

5from .constant import ( 

6 COMMON_SAFE_ASCII_CHARACTERS, 

7 TRACE, 

8 UNICODE_SECONDARY_RANGE_KEYWORD, 

9) 

10from .utils import ( 

11 is_accentuated, 

12 is_ascii, 

13 is_case_variable, 

14 is_cjk, 

15 is_emoticon, 

16 is_hangul, 

17 is_hiragana, 

18 is_katakana, 

19 is_latin, 

20 is_punctuation, 

21 is_separator, 

22 is_symbol, 

23 is_thai, 

24 is_unprintable, 

25 remove_accent, 

26 unicode_range, 

27) 

28 

29 

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover

62 

63 

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content in which symbols and punctuation are over-represented
    among printable characters (>= 30% of what was fed).
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Last printable character seen; immediate repeats of the same
        # character are only counted once (e.g. "!!!!" scores a single '!').
        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh twice as much as punctuation in the ratio.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # BUGFIX: also restore the per-stream tracking state, otherwise a
        # reused plugin carries deduplication context from previous content.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        Ratio of punctuation+symbols over printable chars fed; 0.0 when
        below the 0.3 significance threshold or when nothing was fed.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

109 

110 

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag content in which accentuated letters dominate the alphabetic
    characters (>= 35% of at least 8 letters).
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        """
        Ratio of accentuated letters; 0.0 when fewer than 8 letters were fed
        or the ratio stays below the 0.35 significance threshold.
        """
        # `== 0` was redundant with `< 8`: a single comparison covers both.
        if self._character_count < 8:
            return 0.0
        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

135 

136 

class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag content containing unprintable characters, a strong sign that the
    sequence was decoded with the wrong charset.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # BUGFIX: the denominator must be cleared as well, otherwise a
        # reused plugin dilutes the ratio with counts from previous content.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        # Each unprintable character is weighted x8: even a few are alarming.
        return (self._unprintable_count * 8) / self._character_count

159 

160 

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Spot successive accentuated Latin letters, a rare pattern in real text.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        return (self._successive_count * 2) / self._character_count

196 

197 

class SuspiciousRange(MessDetectorPlugin):
    """
    Count abnormal jumps between distinct Unicode ranges in successive
    printable characters.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the chain:
        # they can legitimately sit between any two scripts.
        chain_broken = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if chain_broken:
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character
        if previous is None:
            return

        range_previous: Optional[str] = unicode_range(previous)
        range_current: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(range_previous, range_current):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        suspicious_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return suspicious_usage if suspicious_usage >= 0.1 else 0.0

248 

249 

class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Detect "words" (alpha runs delimited by spaces/punctuation/separators)
    that look abnormal: heavily accentuated, ending with an upper-case
    accentuated letter, very long runs being watched as foreign, or words
    containing raw symbols.
    """

    def __init__(self) -> None:
        # Totals across all completed words.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Flags for the word currently being accumulated.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the in-progress word, plus its accent tally.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Start watching once a character is seen that is neither plain
            # (unaccentuated) Latin nor part of the listed Asian scripts; a
            # long enough watched word is flagged below.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        # Non-alpha before any word started: nothing to evaluate.
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # A word just ended: evaluate the accumulated buffer.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # More than roughly a third accentuated characters is abnormal.
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with a upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            # Reset per-word state for the next word.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol embedded inside a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Not significant until enough words were seen, unless a
        # foreign-long word was already flagged.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

338 

339 

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        # '丅' and '丄' are the tell-tale mis-rendered stop characters.
        if character == "丅" or character == "丄":
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Not significant on fewer than 16 CJK characters.
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

369 

370 

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect repeated upper/lower case alternation inside chunks of
    case-variable letters (e.g. "aBcDeF"), scored per chunk.
    """

    def __init__(self) -> None:
        # True after one case switch; a second consecutive switch scores.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score accumulated in the current chunk / committed total.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        # A character that is not a case-variable letter ends the chunk.
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Commit the chunk score only for short (<= 64 chars), non-digit,
            # non-pure-ASCII chunks.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Reset per-chunk state.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                # Case switched relative to the previous letter; two switches
                # in a row add 2 to the chunk score.
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count

445 

446 

@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    # An unknown range next to anything is considered suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    both_ranges = (unicode_range_a, unicode_range_b)

    if all("Latin" in rng for rng in both_ranges):
        return False

    if any("Emoticons" in rng for rng in both_ranges):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if any("Latin" in rng for rng in both_ranges) and any(
        "Combining" in rng for rng in both_ranges
    ):
        return False

    # Ranges sharing a significant keyword (e.g. script name) are fine.
    keywords_a = unicode_range_a.split(" ")
    keywords_b = unicode_range_b.split(" ")

    for keyword in keywords_a:
        if keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if keyword in keywords_b:
            return False

    # Japanese Exception
    range_a_jp_chars = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp_chars = unicode_range_b in ("Hiragana", "Katakana")

    either_has_cjk = "CJK" in unicode_range_a or "CJK" in unicode_range_b

    if (range_a_jp_chars or range_b_jp_chars) and either_has_cjk:
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if either_has_cjk:
            return False
        if "Basic Latin" in both_ranges:
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if either_has_cjk or (range_a_jp_chars and range_b_jp_chars):
        if any("Punctuation" in rng for rng in both_ranges):
            return False
        if any("Forms" in rng for rng in both_ranges):
            return False

    return True

516 

517 

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """

    # One fresh instance of every registered detector (all direct
    # subclasses of MessDetectorPlugin).
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    # +1 accounts for the trailing "\n" appended below to flush word buffers.
    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Check the accumulated ratio more frequently on short content.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        # Re-evaluate periodically and on the final character.
        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            # Early exit: already messy enough, no need to scan further.
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)