"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""

import unicodedata
import warnings
from typing import (
    Any,
    Dict,
    Iterator,
    List,
    NamedTuple,
    Optional,
    TextIO,
    Tuple,
    Union,
    cast,
    no_type_check,
)

from ftfy import bad_codecs
from ftfy import chardata, fixes
from ftfy.badness import is_bad
from ftfy.formatting import display_ljust

__version__ = "6.1.2"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
bad_codecs.ok()


class ExplanationStep(NamedTuple):
    """
    A step in an ExplainedText, explaining how to decode text.

    The possible actions are:

    - "encode": take in a string and encode it as bytes, with the given encoding
    - "decode": take in bytes and decode them as a string, with the given encoding
    - "transcode": convert bytes to bytes with a particular named function
    - "apply": convert str to str with a particular named function

    The `parameter` is the name of the encoding or function to use. If it's a
    function, it must appear in the FIXERS dictionary.
    """

    action: str
    parameter: str

    def __repr__(self) -> str:
        """
        Get the string representation of an ExplanationStep. We output the
        representation of the equivalent tuple, for simplicity.
        """
        return repr(tuple(self))
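
# A quick sketch (illustrative, not part of the original module): because an
# ExplanationStep reprs like the tuple it wraps, a plan prints compactly:
#
#     step = ExplanationStep("decode", "utf-8")
#     repr(step)   # -> "('decode', 'utf-8')"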



class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of which
    steps they applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    """

    text: str
    explanation: Optional[List[ExplanationStep]]


# Functions that can be applied using `apply_plan`.
FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}
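
# For illustration (a sketch, not part of the module): each FIXERS entry maps a
# configuration field name to the function that implements it, so a fix can be
# looked up and applied by name:
#
#     FIXERS["uncurl_quotes"]("\u2018hi\u2019")   # -> "'hi'"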



class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

      The following four options affect how `fix_encoding` works, and do nothing if
      `fix_encoding` is False:

    - `restore_byte_a0`: True

      Allow a literal space (U+20) to be interpreted as a non-breaking space
      (U+A0) when that would make it part of a fixable mojibake string.

      Because spaces are very common characters, this could lead to false
      positives, but we try to apply it only when there's strong evidence for
      mojibake. Disabling `restore_byte_a0` is safer from false positives,
      but creates false negatives.

    - `replace_lossy_sequences`: True

      Detect mojibake that has been partially replaced by the characters
      '�' or '?'. If the mojibake could be decoded otherwise, replace the
      detected sequence with '�'.

    - `decode_inconsistent_utf8`: True

      When we see sequences that distinctly look like UTF-8 mojibake, but
      there's no consistent way to reinterpret the string in a new encoding,
      replace the mojibake with the appropriate UTF-8 characters anyway.

      This helps to decode strings that are concatenated from different
      encodings.

    - `fix_c1_controls`: True

      Replace C1 control characters (the useless characters U+80 - U+9B that
      come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
      even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True
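
# Usage sketch (illustrative): TextFixerConfig is a NamedTuple, so defaults can
# be overridden at construction time, and `_replace` derives a modified copy:
#
#     config = TextFixerConfig(uncurl_quotes=False, explain=False)
#     config.normalization                  # -> "NFC" (still the default)
#     config = config._replace(normalization=None)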



def _config_from_kwargs(
    config: TextFixerConfig, kwargs: Dict[str, Any]
) -> TextFixerConfig:
    """
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.
    """
    if "fix_entities" in kwargs:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`", DeprecationWarning
        )
        kwargs = kwargs.copy()
        kwargs["unescape_html"] = kwargs["fix_entities"]
        del kwargs["fix_entities"]
    config = config._replace(**kwargs)
    return config
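
# For example (a sketch): _config_from_kwargs(TextFixerConfig(), {"uncurl_quotes": False})
# returns a config with uncurl_quotes=False, while the deprecated spelling
# {"fix_entities": False} triggers a DeprecationWarning and is applied as
# {"unescape_html": False}.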



BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str,
    text: str,
    config: TextFixerConfig,
    steps: Optional[List[ExplanationStep]],
) -> str:
    """
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.
    """
    if getattr(config, fixer_name):
        fixer = FIXERS[fixer_name]
        fixed = fixer(text)
        if steps is not None and fixed != text:
            steps.append(ExplanationStep("apply", fixer_name))
        return cast(str, fixed)

    return text
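
# For example (a sketch): with `uncurl_quotes` enabled and `steps = []`,
#
#     _try_fix("uncurl_quotes", "\u2018hi\u2019", config, steps)
#
# returns "'hi'" and appends ExplanationStep("apply", "uncurl_quotes") to
# `steps`; with the option disabled, the text comes back unchanged and nothing
# is recorded.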



def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

    >>> fix_text('âœ” No problems')
    '✔ No problems'

    >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
    ¯\_(ツ)_/¯

    >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
    "Broken text... it's flubberific!"

    >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
    'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    """

    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)
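
# A behavioral sketch (illustrative): the "auto" setting for `unescape_html`
# unescapes entities unless a literal '<' suggests the input is real HTML:
#
#     fix_text("&lt;3")           # -> "<3" (no literal '<' in the input)
#     fix_text("<b>&lt;3</b>")    # -> "<b>&lt;3</b>" (markup, entities kept)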



def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[ExplanationStep]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                text = fix_encoding(text)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                if encoding_steps is not None:
                    steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(ExplanationStep("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)
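
# A quick sketch (illustrative): the returned plan can be replayed on other
# text with the same problem using `apply_plan`:
#
#     fixed, plan = fix_and_explain("schÃ¶n")
#     fixed   # -> 'schön'
#     plan    # -> [('encode', 'latin-1'), ('decode', 'utf-8')]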



def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ  le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    plan_so_far: List[ExplanationStep] = []
    while True:
        prevtext = text
        text, plan = _fix_encoding_one_step_and_explain(text, config)
        if plan is not None:
            plan_so_far.extend(plan)
        if text == prevtext:
            return ExplainedText(text, plan_so_far)
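
# Note (an illustrative sketch, not from the original module): because this
# loop repeats until the text stops changing, doubly-encoded mojibake can be
# unwound over multiple passes. For example, "sÃƒÂ³" should come back as "só",
# with two encode/decode rounds appearing in the returned plan.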



def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ExplanationStep("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "restore_byte_a0")
                        )
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "replace_lossy_sequences")
                        )
                        encoded_bytes = replaced_bytes

                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ExplanationStep("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [ExplanationStep("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [
                        ExplanationStep("encode", "latin-1"),
                        ExplanationStep("decode", "windows-1252"),
                    ]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [ExplanationStep("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
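
# For example (a sketch): the C1 controls U+0093 and U+0094 are valid Latin-1
# but have no place in Windows-1252, so the branch above should re-read them as
# the curly quotes they were meant to be:
#
#     fix_encoding("\x93basta\x94")   # expected: '“basta”'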



def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    >>> fix_encoding("Ã³")
    'ó'
    >>> fix_encoding("&ATILDE;&SUP3;")
    '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_encoding_and_explain(text, config)
    return fixed



# Some alternate names for the main functions
ftfy = fix_text


def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_and_explain(text, config)
    return fixed



def fix_file(
    input_file: TextIO,
    encoding: Optional[str] = None,
    config: Optional[TextFixerConfig] = None,
    **kwargs
) -> Iterator[str]:
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
    """
    if config is None:
        config = TextFixerConfig()
    config = _config_from_kwargs(config, kwargs)

    for line in input_file:
        if isinstance(line, bytes):
            if encoding is None:
                line, encoding = guess_bytes(line)
            else:
                line = line.decode(encoding)
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        fixed_line, _explan = fix_and_explain(line, config)
        yield fixed_line
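
# Usage sketch (illustrative, not part of the module):
#
#     import io
#     for line in fix_file(io.StringIO("schÃ¶n\n")):
#         print(line, end="")   # -> schön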



def guess_bytes(bstring: bytes) -> Tuple[str, str]:
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)
    try:
        if 0xED in byteset or 0xC0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        else:
            return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
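
# For example (a sketch of the decision tree above):
#
#     guess_bytes("résumé".encode("utf-8"))   # -> ('résumé', 'utf-8')
#     guess_bytes(b"caf\xe9")                 # -> ('café', 'sloppy-windows-1252')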



@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, encoding in plan:
        if operation == "encode":
            obj = obj.encode(encoding)
        elif operation == "decode":
            obj = obj.decode(encoding)
        elif operation in ("transcode", "apply"):
            if encoding in FIXERS:
                obj = FIXERS[encoding](obj)
            else:
                raise ValueError("Unknown function to apply: %s" % encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj

757 

758 

759def explain_unicode(text: str): 

760 """ 

761 A utility method that's useful for debugging mysterious Unicode. 

762 

763 It breaks down a string, showing you for each codepoint its number in 

764 hexadecimal, its glyph, its category in the Unicode standard, and its name 

765 in the Unicode standard. 

766 

767 >>> explain_unicode('(╯°□°)╯︵ ┻━┻') 

768 U+0028 ( [Ps] LEFT PARENTHESIS 

769 U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT 

770 U+00B0 ° [So] DEGREE SIGN 

771 U+25A1 □ [So] WHITE SQUARE 

772 U+00B0 ° [So] DEGREE SIGN 

773 U+0029 ) [Pe] RIGHT PARENTHESIS 

774 U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT 

775 U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS 

776 U+0020 [Zs] SPACE 

777 U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL 

778 U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL 

779 U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL 

780 """ 

781 for char in text: 

782 if char.isprintable(): 

783 display = char 

784 else: 

785 display = char.encode("unicode-escape").decode("ascii") 

786 print( 

787 "U+{code:04X} {display} [{category}] {name}".format( 

788 display=display_ljust(display, 7), 

789 code=ord(char), 

790 category=unicodedata.category(char), 

791 name=unicodedata.name(char, "<unknown>"), 

792 ) 

793 )