Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/md.py: 23%

301 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-08 06:40 +0000

1from functools import lru_cache 

2from logging import getLogger 

3from typing import List, Optional 

4 

5from .constant import ( 

6 COMMON_SAFE_ASCII_CHARACTERS, 

7 TRACE, 

8 UNICODE_SECONDARY_RANGE_KEYWORD, 

9) 

10from .utils import ( 

11 is_accentuated, 

12 is_arabic, 

13 is_arabic_isolated_form, 

14 is_case_variable, 

15 is_cjk, 

16 is_emoticon, 

17 is_hangul, 

18 is_hiragana, 

19 is_katakana, 

20 is_latin, 

21 is_punctuation, 

22 is_separator, 

23 is_symbol, 

24 is_thai, 

25 is_unprintable, 

26 remove_accent, 

27 unicode_range, 

28) 

29 

30 

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether this character is relevant to the plugin and
        should be passed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Consume one character and update the internal chaos statistics.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the plugin to its freshly-constructed state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from everything feed() has seen so far.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover

63 

64 

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag printable content that is overloaded with punctuation and/or symbols.
    Repeated identical characters and common safe ASCII are not counted.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Last printable character seen; identical repeats are ignored.
        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Non-emoticon symbols weigh double compared to punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # BUG FIX: the last seen character previously survived reset(), which
        # could make the first character of the next analysis be silently
        # skipped when it happened to match the stale value.
        self._last_printable_char = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        # Below 30% the content is considered unremarkable.
        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

110 

111 

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Detect text where an unusually large share of the letters carry accents,
    which commonly indicates a wrong decoding choice.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        # Only letters matter for accent statistics.
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        # Fewer than 8 letters: not enough signal to judge.
        if self._character_count < 8:
            return 0.0

        accent_share: float = self._accentuated_count / self._character_count
        return accent_share if accent_share >= 0.35 else 0.0

137 

138 

class UnprintablePlugin(MessDetectorPlugin):
    """
    Count unprintable characters; even a small proportion is a strong sign
    of a wrong decoding, hence the heavy weighting in ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        # Every character is inspected.
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # BUG FIX: the character counter was previously not cleared on
        # reset(), which artificially deflated the ratio of any subsequent
        # analysis performed with this instance.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        # Each unprintable character weighs 8x.
        return (self._unprintable_count * 8) / self._character_count

161 

162 

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Detect successive accentuated Latin letters, a pattern that rarely occurs
    in legitimate text but often in mojibake.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse when it is the same base letter under a different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count

198 

199 

class SuspiciousRange(MessDetectorPlugin):
    """
    Flag printable text that keeps jumping between Unicode ranges that do not
    plausibly appear next to each other.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the chain.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character
        if previous is None:
            return

        if is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        # Not enough characters to draw a conclusion.
        if self._character_count <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / self._character_count

247 

248 

class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Detect implausible words: heavily accentuated words, suspiciously long
    non-Latin/non-Asian runs, and letters mixed with unexpected symbols.
    Characters are buffered into words; statistics are committed when a
    separator (space/punctuation) ends the word.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Start watching as soon as a non-Latin (or accentuated) letter
            # appears that does not belong to a known CJK/Asian script.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # Word boundary reached: commit statistics for the buffered word.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # Over ~1/3 accentuated letters in one word is suspicious.
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                if buffer_length >= 24 and self._foreign_long_watch:
                    # Tolerate very long identifiers that look camelCased.
                    camel_case_dst = [
                        i
                        for c, i in zip(self._buffer, range(0, buffer_length))
                        if c.isupper()
                    ]
                    probable_camel_cased: bool = False

                    if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                        probable_camel_cased = True

                    if not probable_camel_cased:
                        self._foreign_long_count += 1
                        self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol embedded inside a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0
        # BUG FIX: the per-word accent counter previously survived reset(),
        # skewing the accent ratio of the first word of the next analysis.
        self._buffer_accent_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum sample unless a foreign-long word was spotted.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

352 

353 

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly when
    the content does not fit, and this is easy to spot: look for an overuse
    of the characters '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character == "丅" or character == "丄":
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Too little CJK content to judge.
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

383 

384 

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect abnormal upper/lower case alternation (e.g. "aBcDeF") inside
    chunks of cased letters, a pattern typical of decoding errors.
    Chunks are delimited by any character that is not a case-variable letter.
    """

    def __init__(self) -> None:
        # True when one alternation has been seen and we are waiting for a
        # second one before counting (pairs of flips are what get scored).
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Running alternation score for the current chunk.
        self._successive_upper_lower_count: int = 0
        # Committed score across all completed chunks.
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        # Tracks whether the current chunk is pure ASCII; pure-ASCII chunks
        # are never committed to the final score.
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        # A character "of concern" is a cased letter; anything else acts
        # as a chunk separator.
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Commit the chunk score only for short (<= 64 chars), non-digit
            # separators and chunks that contained non-ASCII characters.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start a fresh chunk; the separator itself still counts as a
            # processed character.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            # Case flipped relative to the previous letter?
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    # Second flip in a row: score it (weight 2).
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count

459 

460 

class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Arabic text rendered mostly in isolated letter forms is a strong sign of
    a wrong decoding (properly encoded Arabic uses contextual forms).
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._isolated_form_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        # Fewer than 8 Arabic characters: not enough signal.
        if self._character_count < 8:
            return 0.0
        return self._isolated_form_count / self._character_count

487 

488 

@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    # An unknown range next to anything is always suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    latin_a = "Latin" in unicode_range_a
    latin_b = "Latin" in unicode_range_b
    if latin_a and latin_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_a or latin_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    # Ranges sharing a significant keyword (e.g. a script name) are fine.
    keywords_range_b = unicode_range_b.split(" ")
    for keyword in unicode_range_a.split(" "):
        if keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if keyword in keywords_range_b:
            return False

    # Japanese exception: kana mixes freely with CJK and with each other.
    range_a_jp_chars = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp_chars = unicode_range_b in ("Hiragana", "Katakana")
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True

560 

561 

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """
    # One fresh instance of every registered detector plugin.
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1
    mean_mess_ratio: float = 0.0

    # Re-evaluate the accumulated ratio every N characters; longer inputs
    # are sampled less frequently.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    # The appended newline guarantees word/chunk based detectors flush
    # their internal buffers at the end of the input.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            # Early exit: content already judged too messy.
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)