Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset

1from __future__ import annotations

3import sys

4from functools import lru_cache

5from logging import getLogger

7if sys.version_info >= (3, 8):

8 from typing import final

9else:

10 try:

11 from typing_extensions import final

12 except ImportError:

14 def final(cls): # type: ignore[misc,no-untyped-def]

15 return cls

18from .constant import (

19 COMMON_CJK_CHARACTERS,

20 COMMON_SAFE_ASCII_CHARACTERS,

21 TRACE,

22 UNICODE_SECONDARY_RANGE_KEYWORD,

23 _ACCENTUATED,

24 _ARABIC,

25 _ARABIC_ISOLATED_FORM,

26 _CJK,

27 _HANGUL,

28 _HIRAGANA,

29 _KATAKANA,

30 _LATIN,

31 _THAI,

32)

33from .utils import (

34 _character_flags,

35 is_emoticon,

36 is_punctuation,

37 is_separator,

38 is_symbol,

39 remove_accent,

40 unicode_range,

41)

# Single bitmask OR-ing the CJK, Hangul, Katakana, Hiragana and Thai flags,
# so that "is this one of those scripts?" is one bitwise AND against a
# character's flags.
_GLYPH_MASK: int = _THAI | _HIRAGANA | _KATAKANA | _HANGUL | _CJK

@final
class CharInfo:
    """Pre-computed properties of one character, shared by all detectors.

    A single instance is created and then refreshed through :meth:`update`
    for every character of the hot loop, so that plugins read plain
    attributes instead of calling str methods (``isalpha``, ``isupper``, ...)
    and cached helpers (``_character_flags``, ``is_punctuation``, ...) again.

    Attributes (all rewritten by every :meth:`update` call):
        character: the character last passed to :meth:`update`.
        printable, alpha, upper, lower, space, digit: same truth value as the
            matching ``str.is*()`` predicate of the character.
        is_ascii: True when ``ord(character) < 128``.
        case_variable: True when exactly one of ``lower`` / ``upper`` is set.
        flags: ``_LATIN`` for ASCII letters, ``_character_flags(character)``
            for non-ASCII alphabetic characters, ``0`` for everything else.
        accentuated, latin, is_cjk, is_arabic, is_glyph: booleans derived
            from ``flags``.
        punct, sym: ``is_punctuation`` / ``is_symbol`` of the character when
            it is printable, always False for letters, digits, whitespace and
            unprintable characters on the ASCII path.
    """

    __slots__ = (
        "character",
        "printable",
        "alpha",
        "upper",
        "lower",
        "space",
        "digit",
        "is_ascii",
        "case_variable",
        "flags",
        "accentuated",
        "latin",
        "is_cjk",
        "is_arabic",
        "is_glyph",
        "punct",
        "sym",
    )

    def __init__(self) -> None:
        """Start with an empty character and every property cleared."""
        self.character: str = ""
        self.printable: bool = False
        self.alpha: bool = False
        self.upper: bool = False
        self.lower: bool = False
        self.space: bool = False
        self.digit: bool = False
        self.is_ascii: bool = False
        self.case_variable: bool = False
        self.flags: int = 0
        self.accentuated: bool = False
        self.latin: bool = False
        self.is_cjk: bool = False
        self.is_arabic: bool = False
        self.is_glyph: bool = False
        self.punct: bool = False
        self.sym: bool = False

    def update(self, character: str) -> None:
        """Update all properties for *character* (called once per character).

        :param character: a one-character string (it is passed to ``ord``).
        """
        self.character = character

        code_point: int = ord(character)

        if code_point < 128:
            # ASCII fast-path: everything is derived from the code point, so
            # _character_flags() is never called for these characters.
            self.is_ascii = True
            self.accentuated = False
            self.is_cjk = False
            self.is_arabic = False
            self.is_glyph = False

            if 65 <= code_point <= 90 or 97 <= code_point <= 122:
                # ASCII letter: A-Z (65-90) or a-z (97-122).
                is_upper: bool = code_point <= 90
                self.alpha = True
                self.upper = is_upper
                self.lower = not is_upper
                self.space = False
                self.digit = False
                self.printable = True
                self.case_variable = True
                self.flags = _LATIN
                self.latin = True
                self.punct = False
                self.sym = False
            elif 48 <= code_point <= 57:
                # ASCII digit 0-9.
                self.alpha = False
                self.upper = False
                self.lower = False
                self.space = False
                self.digit = True
                self.printable = True
                self.case_variable = False
                self.flags = 0
                self.latin = False
                self.punct = False
                self.sym = False
            elif (
                code_point == 32
                or 9 <= code_point <= 13
                or 28 <= code_point <= 31
            ):
                # ASCII whitespace as defined by str.isspace(): space,
                # \t \n \v \f \r, and the FS/GS/RS/US separators (28-31).
                # The latter were previously missed by the fast-path, which
                # made ``space`` disagree with the non-ASCII path.
                self.alpha = False
                self.upper = False
                self.lower = False
                self.space = True
                self.digit = False
                # Only the plain space is printable among ASCII whitespace.
                self.printable = code_point == 32
                self.case_variable = False
                self.flags = 0
                self.latin = False
                self.punct = False
                self.sym = False
            else:
                # Remaining ASCII: punctuation, symbols, control characters.
                printable: bool = character.isprintable()
                self.printable = printable
                self.alpha = False
                self.upper = False
                self.lower = False
                self.space = False
                self.digit = False
                self.case_variable = False
                self.flags = 0
                self.latin = False
                # Helpers are only consulted for printable characters.
                self.punct = is_punctuation(character) if printable else False
                self.sym = is_symbol(character) if printable else False
        else:
            # Non-ASCII path: rely on the str predicates.
            self.is_ascii = False
            self.printable = character.isprintable()
            self.alpha = character.isalpha()
            self.upper = character.isupper()
            self.lower = character.islower()
            self.space = character.isspace()
            self.digit = character.isdigit()
            self.case_variable = self.lower != self.upper

            # Flags are only looked up for alphabetic characters; everything
            # else gets an empty bitmask.
            flags: int = _character_flags(character) if self.alpha else 0
            self.flags = flags
            self.accentuated = bool(flags & _ACCENTUATED)
            self.latin = bool(flags & _LATIN)
            self.is_cjk = bool(flags & _CJK)
            self.is_arabic = bool(flags & _ARABIC)
            self.is_glyph = bool(flags & _GLYPH_MASK)

            # Computed eagerly and stored as plain attributes so the hot loop
            # reads them without property dispatch.
            self.punct = is_punctuation(character) if self.printable else False
            self.sym = is_symbol(character) if self.printable else False

204

205

class MessDetectorPlugin:
    """
    Abstract base class of every mess detection plugin.
    Each detector MUST subclass it and override all three members below;
    the defaults only raise NotImplementedError.
    """

    __slots__ = ()

    def feed_info(self, character: str, info: CharInfo) -> None:
        """
        Main routine, executed for a single character.
        Put here the logic by which the text would be considered chaotic.
        *info* carries the pre-computed properties of *character*.
        """
        raise NotImplementedError()  # Defensive:

    def reset(self) -> None:  # Defensive:
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError()

    @property
    def ratio(self) -> float:
        """
        Chaos ratio computed from what feed_info() has seen so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError()  # Defensive:

234

235

@final
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """Detect text saturated with punctuation and/or symbols.

    Every fed character counts toward the total. A character that differs
    from the previously fed one and is not part of
    ``COMMON_SAFE_ASCII_CHARACTERS`` adds 1 when it is punctuation, or 2 when
    it is a non-digit symbol that ``is_emoticon`` does not recognise.
    """

    __slots__ = (
        "_punctuation_count",
        "_symbol_count",
        "_character_count",
        "_last_printable_char",
        "_frenzy_symbol_in_word",
    )

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: str | None = None
        # Not read or written by any method of this class besides
        # __init__/reset; kept because the slot is part of the class layout.
        self._frenzy_symbol_in_word: bool = False

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Optimized feed using pre-computed character info."""
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if info.punct:
                self._punctuation_count += 1
            elif not info.digit and info.sym and not is_emoticon(character):
                # Symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # Abstract
        """Restore the exact state produced by ``__init__``."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Previously left untouched: the first character fed after a reset
        # was then compared against a character from the previous run.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Weighted punctuation+symbol share, or 0.0 when below 0.3."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

284

285

@final
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """Detect text in which too many of the fed characters are accentuated."""

    __slots__ = ("_character_count", "_accentuated_count")

    def __init__(self) -> None:
        self._accentuated_count: int = 0
        self._character_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count the character, and count it again as accentuated if it is."""
        if info.accentuated:
            self._accentuated_count += 1

        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Accentuated share; 0.0 under 8 characters or under a 0.35 share."""
        total: int = self._character_count

        if total < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / total

        if accentuated_share >= 0.35:
            return accentuated_share
        return 0.0

312

313

@final
class UnprintablePlugin(MessDetectorPlugin):
    """Detect text containing unprintable characters.

    A character is counted as unprintable when it is neither whitespace nor
    printable, and is not one of the two tolerated characters ``"\\x1a"``
    and ``"\\ufeff"``.
    """

    __slots__ = ("_unprintable_count", "_character_count")

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Optimized feed using pre-computed character info."""
        if (
            not info.space
            and not info.printable
            and character != "\x1a"
            and character != "\ufeff"
        ):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # Abstract
        """Restore the exact state produced by ``__init__``."""
        self._unprintable_count = 0
        # Previously left untouched, which kept the ratio denominator of the
        # previous run alive after a reset.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Unprintable share, weighted eight times; 0.0 when nothing was fed."""
        if self._character_count == 0:  # Defensive:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count

342

343

@final
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """Detect two accentuated characters fed one right after the other."""

    __slots__ = (
        "_successive_count",
        "_character_count",
        "_last_latin_character",
        "_last_was_accentuated",
    )

    def __init__(self) -> None:
        self._character_count: int = 0
        self._successive_count: int = 0

        self._last_was_accentuated: bool = False
        self._last_latin_character: str | None = None

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare *character* with the previously fed one, then remember it."""
        previous: str | None = self._last_latin_character
        accentuated: bool = info.accentuated

        self._character_count += 1

        if previous is not None and accentuated and self._last_was_accentuated:
            # Two accentuated characters in a row: one point when both are
            # uppercase ...
            if info.upper and previous.isupper():
                self._successive_count += 1
            # ... and one more when they share the same accent-less form.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_was_accentuated = accentuated
        self._last_latin_character = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._successive_count = 0
        self._last_was_accentuated = False
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        """Twice the successive count over the fed count; 0.0 when empty."""
        total: int = self._character_count

        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total

387

388

@final
class SuspiciousRange(MessDetectorPlugin):
    """Detect neighbouring characters whose Unicode ranges look suspicious."""

    __slots__ = (
        "_suspicious_successive_range_count",
        "_character_count",
        "_last_printable_seen",
        "_last_printable_range",
    )

    def __init__(self) -> None:
        self._character_count: int = 0
        self._suspicious_successive_range_count: int = 0
        self._last_printable_range: str | None = None
        self._last_printable_seen: str | None = None

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Compare the range of *character* with the one of its predecessor."""
        self._character_count += 1

        # Whitespace, punctuation and the "safe" ASCII characters break the
        # chain: the next character has no predecessor to be compared with.
        if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS:
            self._last_printable_range = None
            self._last_printable_seen = None
            return

        current_range: str | None = unicode_range(character)

        if self._last_printable_seen is not None and is_suspiciously_successive_range(
            self._last_printable_range, current_range
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_range = current_range
        self._last_printable_seen = character

    def reset(self) -> None:  # Abstract
        self._suspicious_successive_range_count = 0
        self._character_count = 0
        self._last_printable_range = None
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        """Twice the suspicious count over the fed count; 0.0 up to 13 fed."""
        total: int = self._character_count

        if total <= 13:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total

443

444

@final
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """Detect words that look weird (accent-heavy, odd casing, long, ...).

    Alphabetic characters (and some symbols) are accumulated into a virtual
    word buffer of which only statistics are kept. The buffer is evaluated
    and cleared when a whitespace, punctuation or separator is fed.
    """

    __slots__ = (
        "_word_count",
        "_bad_word_count",
        "_foreign_long_count",
        "_is_current_word_bad",
        "_foreign_long_watch",
        "_character_count",
        "_bad_character_count",
        "_buffer_length",
        "_buffer_last_char",
        "_buffer_last_char_accentuated",
        "_buffer_accent_count",
        "_buffer_glyph_count",
        "_buffer_upper_count",
    )

    def __init__(self) -> None:
        # Totals over the completed words.
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0
        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Verdict flags for the word being accumulated.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        # Statistics of the word being accumulated.
        self._buffer_length: int = 0
        self._buffer_last_char: str | None = None
        self._buffer_last_char_accentuated: bool = False
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0
        self._buffer_upper_count: int = 0

    def _clear_word_buffer(self) -> None:
        """Forget the statistics of the word being accumulated."""
        self._foreign_long_watch = False
        self._buffer_length = 0
        self._buffer_last_char = None
        self._buffer_last_char_accentuated = False
        self._buffer_accent_count = 0
        self._buffer_glyph_count = 0
        self._buffer_upper_count = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Grow the current word, or evaluate it when a boundary is fed."""
        if info.alpha:
            accentuated: bool = info.accentuated

            self._buffer_length += 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = accentuated

            if info.upper:
                self._buffer_upper_count += 1
            if accentuated:
                self._buffer_accent_count += 1

            if info.is_glyph:
                self._buffer_glyph_count += 1
            elif not self._foreign_long_watch and (accentuated or not info.latin):
                # Non-glyph letter that is accentuated or not latin: keep an
                # eye on this word, it is judged below if it grows long.
                self._foreign_long_watch = True
            return

        word_length: int = self._buffer_length

        # Nothing buffered: a non-alphabetic character cannot start a word.
        if not word_length:
            return

        if info.space or info.punct or is_separator(character):
            self._word_count += 1
            self._character_count += word_length

            if word_length >= 4:
                if self._buffer_accent_count / word_length >= 0.5:
                    self._is_current_word_bad = True
                elif (
                    self._buffer_last_char_accentuated
                    and self._buffer_last_char.isupper()  # type: ignore[union-attr]
                    and self._buffer_upper_count != word_length
                ):
                    # Ends with an uppercase accentuated letter while the
                    # word is not fully uppercase.
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1

            if word_length >= 24 and self._foreign_long_watch:
                uppercase_count: int = self._buffer_upper_count

                # Long words are tolerated only when they look camel-cased,
                # i.e. contain some uppercase letters but at most 30% of them.
                if uppercase_count <= 0 or uppercase_count / word_length > 0.3:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1

            if self._is_current_word_bad:
                self._bad_character_count += word_length
                self._bad_word_count += 1
                self._is_current_word_bad = False

            self._clear_word_buffer()
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and not info.digit
            and info.sym
        ):
            # A symbol inside a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer_length = word_length + 1
            self._buffer_last_char = character
            self._buffer_last_char_accentuated = False

    def reset(self) -> None:  # Abstract
        self._clear_word_buffer()
        self._is_current_word_bad = False
        self._word_count = 0
        self._bad_word_count = 0
        self._foreign_long_count = 0
        self._character_count = 0
        self._bad_character_count = 0

    @property
    def ratio(self) -> float:
        """Share of characters that belong to bad words.

        Returns 0.0 while at most 10 words were completed and none of them
        was flagged through the foreign/long counter.
        """
        if self._foreign_long_count == 0 and self._word_count <= 10:
            return 0.0

        return self._bad_character_count / self._character_count

577

578

@final
class CjkUncommonPlugin(MessDetectorPlugin):
    """
    Detect messy CJK text that probably means nothing.
    """

    __slots__ = ("_character_count", "_uncommon_count")

    def __init__(self) -> None:
        self._uncommon_count: int = 0
        self._character_count: int = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count the character; flag it when absent from the common CJK set."""
        if character not in COMMON_CJK_CHARACTERS:
            self._uncommon_count += 1

        self._character_count += 1

    def reset(self) -> None:  # Abstract
        self._uncommon_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """A tenth of the uncommon share when above 0.5, else 0.0."""
        total: int = self._character_count

        if total < 8:
            return 0.0

        uncommon_form_usage: float = self._uncommon_count / total

        # we can be pretty sure it's garbage when uncommon characters are widely
        # used. otherwise it could just be traditional chinese for example.
        if uncommon_form_usage > 0.5:
            return uncommon_form_usage / 10
        return 0.0

612

613

@final
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """Detect chunks of letters whose case keeps alternating.

    A chunk is a run of characters ended by the first character that is not
    a case-variable letter. Case flips are counted per chunk and only kept
    when the chunk qualifies (see :meth:`feed_info`).
    """

    __slots__ = (
        "_buf",
        "_character_count_since_last_sep",
        "_successive_upper_lower_count",
        "_successive_upper_lower_count_final",
        "_character_count",
        "_last_alpha_seen",
        "_last_alpha_seen_upper",
        "_last_alpha_seen_lower",
        "_current_ascii_only",
    )

    def __init__(self) -> None:
        self._character_count: int = 0
        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        # True when the previous character already flipped the case; a second
        # consecutive flip is what gets counted.
        self._buf: bool = False

        self._last_alpha_seen: str | None = None
        self._last_alpha_seen_upper: bool = False
        self._last_alpha_seen_lower: bool = False
        self._current_ascii_only: bool = True

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Track case flips inside the current chunk, or close the chunk."""
        cased_letter: bool = info.alpha and info.case_variable

        if not cased_letter and self._character_count_since_last_sep > 0:
            # Separator closing a non-empty chunk. Its flips are kept only if
            # the chunk is at most 64 long, contains a non-ASCII character and
            # the separator itself is not a digit.
            if (
                not self._current_ascii_only
                and not info.digit
                and self._character_count_since_last_sep <= 64
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._current_ascii_only = True
            self._last_alpha_seen = None
            self._buf = False
            return

        if not info.is_ascii:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            case_flipped: bool = bool(
                (info.upper and self._last_alpha_seen_lower)
                or (info.lower and self._last_alpha_seen_upper)
            )

            if case_flipped and self._buf:
                # Second flip in a row.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = case_flipped

        self._character_count_since_last_sep += 1
        self._character_count += 1
        self._last_alpha_seen_lower = info.lower
        self._last_alpha_seen_upper = info.upper
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._buf = False
        self._current_ascii_only = True
        self._last_alpha_seen = None
        self._last_alpha_seen_upper = False
        self._last_alpha_seen_lower = False
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count_since_last_sep = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """Kept flip count over the fed count; 0.0 when nothing was fed."""
        total: int = self._character_count

        if total == 0:  # Defensive:
            return 0.0

        return self._successive_upper_lower_count_final / total

705

706

@final
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Measure how many fed characters carry the Arabic isolated-form flag."""

    __slots__ = ("_character_count", "_isolated_form_count")

    def __init__(self) -> None:
        self._isolated_form_count: int = 0
        self._character_count: int = 0

    def reset(self) -> None:  # Abstract
        self._isolated_form_count = 0
        self._character_count = 0

    def feed_info(self, character: str, info: CharInfo) -> None:
        """Count the character; flag it when its isolated-form bit is set."""
        if info.flags & _ARABIC_ISOLATED_FORM:
            self._isolated_form_count += 1

        self._character_count += 1

    @property
    def ratio(self) -> float:
        """Isolated-form share; 0.0 while fewer than 8 characters were fed."""
        total: int = self._character_count

        if total < 8:
            return 0.0

        return self._isolated_form_count / total

734

735

@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
    """
    Tell whether two Unicode ranges seen next to each other are suspicious.

    An unknown range (None) is always suspicious. Otherwise the pair is
    suspicious unless one of the exceptions below clears it.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    range_a: str = unicode_range_a
    range_b: str = unicode_range_b

    if range_a == range_b:
        return False

    def in_either(keyword: str) -> bool:
        """True when *keyword* is a substring of at least one range name."""
        return keyword in range_a or keyword in range_b

    if "Latin" in range_a and "Latin" in range_b:
        return False

    if in_either("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if in_either("Latin") and in_either("Combining"):
        return False

    # Both names share a word that is not a secondary keyword.
    words_of_b = range_b.split(" ")
    if any(
        word in words_of_b
        for word in range_a.split(" ")
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD
    ):
        return False

    either_cjk: bool = in_either("CJK")
    either_basic_latin: bool = "Basic Latin" in (range_a, range_b)

    # Japanese Exception
    kana = ("Hiragana", "Katakana")
    a_is_kana: bool = range_a in kana
    b_is_kana: bool = range_b in kana

    if (a_is_kana or b_is_kana) and either_cjk:
        return False
    if a_is_kana and b_is_kana:
        return False

    if in_either("Hangul") and (either_cjk or either_basic_latin):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    # (A pair made of two kana ranges was already cleared above, so only the
    # CJK side of that exception can still apply here.)
    if either_cjk and (
        in_either("Punctuation") or in_either("Forms") or either_basic_latin
    ):
        return False

    return True

808

809

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of a decoded sequence, rounded to 3 decimals.

    The sequence is processed block by block; after each block the ratios of
    all detectors are summed, and the computation stops early as soon as that
    sum reaches *maximum_threshold*. With *debug* set, details are logged at
    TRACE level on the "charset_normalizer" logger.
    """

    seq_len: int = len(decoded_sequence)

    # Block size between two intermediary evaluations of the ratio.
    if seq_len >= 1024:
        step: int = 128
    elif seq_len >= 511:
        step = 64
    else:
        step = 32

    # One named local per detector instead of a generic list: the hot loop
    # below then needs neither a per-character iteration over the detectors
    # nor a per-character eligibility dispatch.
    symbols: TooManySymbolOrPunctuationPlugin = TooManySymbolOrPunctuationPlugin()
    accents: TooManyAccentuatedPlugin = TooManyAccentuatedPlugin()
    unprintable: UnprintablePlugin = UnprintablePlugin()
    duplicate_accents: SuspiciousDuplicateAccentPlugin = (
        SuspiciousDuplicateAccentPlugin()
    )
    ranges: SuspiciousRange = SuspiciousRange()
    words: SuperWeirdWordPlugin = SuperWeirdWordPlugin()
    cjk: CjkUncommonPlugin = CjkUncommonPlugin()
    casing: ArchaicUpperLowerPlugin = ArchaicUpperLowerPlugin()
    arabic: ArabicIsolatedFormPlugin = ArabicIsolatedFormPlugin()

    # Bound methods cached as locals for the hot loop.
    feed_symbols = symbols.feed_info
    feed_accents = accents.feed_info
    feed_unprintable = unprintable.feed_info
    feed_duplicate_accents = duplicate_accents.feed_info
    feed_ranges = ranges.feed_info
    feed_words = words.feed_info
    feed_cjk = cjk.feed_info
    feed_casing = casing.feed_info
    feed_arabic = arabic.feed_info

    # A single CharInfo is refreshed in place for every character.
    info: CharInfo = CharInfo()
    refresh_info = info.update

    def summed_ratio() -> float:
        """Add up the current ratio of every detector (fixed order)."""
        return (
            symbols.ratio
            + accents.ratio
            + unprintable.ratio
            + duplicate_accents.ratio
            + ranges.ratio
            + words.ratio
            + cjk.ratio
            + casing.ratio
            + arabic.ratio
        )

    mean_mess_ratio: float

    for block_start in range(0, seq_len, step):
        for character in decoded_sequence[block_start : block_start + step]:
            # Compute the shared character properties once for all plugins.
            refresh_info(character)

            # Fed with every character.
            feed_unprintable(character, info)
            feed_words(character, info)
            feed_casing(character, info)

            # Fed with printable characters only.
            if info.printable:
                feed_symbols(character, info)
                feed_ranges(character, info)

            # Fed with alphabetic characters only.
            if info.alpha:
                feed_accents(character, info)
                # Alphabetic and latin.
                if info.latin:
                    feed_duplicate_accents(character, info)

            # Fed with CJK characters only.
            if info.is_cjk:
                feed_cjk(character, info)

            # Fed with Arabic characters only.
            if info.is_arabic:
                feed_arabic(character, info)

        mean_mess_ratio = summed_ratio()

        if mean_mess_ratio >= maximum_threshold:
            break
    else:
        # Whole sequence consumed without reaching the threshold: feed a
        # trailing newline so that the last buffered word gets evaluated.
        refresh_info("\n")
        feed_words("\n", info)
        feed_casing("\n", info)
        feed_unprintable("\n", info)

        mean_mess_ratio = summed_ratio()

    if debug:  # Defensive:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if seq_len > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in [
            symbols,
            accents,
            unprintable,
            duplicate_accents,
            ranges,
            words,
            cjk,
            casing,
            arabic,
        ]:
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/md.py: 89%

533 statements