from __future__ import annotations

from functools import lru_cache
from logging import getLogger

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
    is_case_variable,
    is_cjk,
    is_cjk_uncommon,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.0; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover


class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
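    """
    Flag printable sequences overloaded with punctuation and symbols.
    A symbol weighs twice as much as a punctuation sign, and the ratio is only
    reported once it reaches 0.3 of the characters seen.
    """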
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: str | None = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
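    """
    Flag alphabetic sequences in which accentuated letters are unusually
    frequent (35% or more, once at least 8 letters have been seen).
    """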
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
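    """
    Flag sequences containing unprintable characters; each occurrence weighs
    heavily (factor 8) against the total character count.
    """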
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
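    """
    Detect suspicious successions of accentuated Latin letters, e.g. two
    consecutive upper-case accentuated letters or two consecutive accentuated
    letters sharing the same base letter.
    """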
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same character duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
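    """
    Detect improbable successions of printable characters drawn from unrelated
    Unicode ranges, as judged by is_suspiciously_successive_range below.
    """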
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: str | None = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: str | None = unicode_range(self._last_printable_seen)
        unicode_range_b: str | None = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count <= 13:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
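    """
    Detect words that are unlikely in a natural language: heavily accentuated
    words, very long "foreign" words, words ending with an upper-case
    accentuated letter, words holding a lone CJK/Hangul/Kana/Thai glyph, or
    words polluted by unexpected symbols.
    """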
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Words/buffers ending with an upper-case accentuated letter are so rare
                # that we consider them all suspicious. Same weight as a foreign_long suspicion.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Detect messy CJK text that probably means nothing.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._uncommon_count: int = 0

    def eligible(self, character: str) -> bool:
        return is_cjk(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_cjk_uncommon(character):
            self._uncommon_count += 1
            return

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._uncommon_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        uncommon_form_usage: float = self._uncommon_count / self._character_count
        # We can be pretty sure it's garbage when uncommon characters are widely
        # used. Otherwise it could just be Traditional Chinese, for example.
        return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0


class ArchaicUpperLowerPlugin(MessDetectorPlugin):
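    """
    Detect unnatural alternation between upper and lower case inside words,
    counted only for chunks of at most 64 characters that contain at least one
    non-ASCII character.
    """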
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


class ArabicIsolatedFormPlugin(MessDetectorPlugin):
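    """
    Measure the proportion of Arabic letters appearing in their isolated
    (presentation) form; the proportion is reported once at least 8 Arabic
    letters have been seen.
    """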
    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        isolated_form_usage: float = self._isolated_form_count / self._character_count

        return isolated_form_usage


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
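    # For instance, "Basic Latin" directly followed by "Cyrillic" is considered
    # suspicious, while "Basic Latin" followed by "Latin-1 Supplement" is not,
    # since both names share the "Latin" keyword.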
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied by a combining diacritical mark,
    # e.g. in Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = (
        unicode_range_a.split(" "),
        unicode_range_b.split(" "),
    )

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True


@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """
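    # Illustrative behaviour (not a strict contract): clean text such as
    # "Hello, how are you?" scores 0.0, while heavily garbled or mojibake input
    # tends to score at or above the default maximum_threshold of 0.2.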

    detectors: list[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

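    # +1 because a trailing "\n" is fed after the payload (see the zip below),
    # which lets word-buffering plugins flush their last pending word.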
    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

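    # Re-evaluate the cumulated ratio periodically: every 32 characters for
    # short payloads, every 64 or 128 characters for longer ones.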
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16:]}")

        for dt in detectors:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)