Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/ftfy/__init__.py: 85%

232 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-08 06:33 +0000

1""" 

2ftfy: fixes text for you 

3 

4This is a module for making text less broken. See the `fix_text` function 

5for more information. 

6""" 

7from __future__ import annotations 

8import unicodedata 

9import warnings 

10from typing import ( 

11 Any, 

12 BinaryIO, 

13 Dict, 

14 Iterator, 

15 List, 

16 NamedTuple, 

17 Optional, 

18 TextIO, 

19 Tuple, 

20 Union, 

21 cast, 

22 no_type_check, 

23) 

24 

25from ftfy import bad_codecs 

26from ftfy import chardata, fixes 

27from ftfy.badness import is_bad 

28from ftfy.formatting import display_ljust 

29 

__version__ = "6.1.2"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
# Importing ftfy.bad_codecs has the side effect of registering the non-standard
# codecs this module decodes with later (e.g. "sloppy-windows-1252" and
# "utf-8-variants").
bad_codecs.ok()

36 

37 

class ExplanationStep(NamedTuple):
    """
    One step in an ExplainedText, describing part of how to decode text.

    The possible actions are:

    - "encode": take in a string and encode it as bytes, with the given encoding
    - "decode": take in bytes and decode them as a string, with the given encoding
    - "transcode": convert bytes to bytes with a particular named function
    - "apply": convert str to str with a particular named function

    The `parameter` is the name of the encoding or function to use. If it's a
    function, it must appear in the FIXERS dictionary.
    """

    action: str
    parameter: str

    def __repr__(self) -> str:
        """
        Show an ExplanationStep as if it were the equivalent plain tuple,
        for simplicity.
        """
        return repr((self.action, self.parameter))

62 

63 

class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of which
    steps it applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    """

    # The fixed text.
    text: str
    # The list of steps that were applied, or None when explanations are off.
    explanation: Optional[List[ExplanationStep]]

75 

76 

# Functions that can be applied using `apply_plan`. The keys are the names
# that appear as the `parameter` of an ExplanationStep with action
# "transcode" or "apply".
FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}

92 

93 

class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

    The following four options affect how `fix_encoding` works, and do nothing if
    `fix_encoding` is False:

    - `restore_byte_a0`: True

      Allow a literal space (U+20) to be interpreted as a non-breaking space
      (U+A0) when that would make it part of a fixable mojibake string.

      Because spaces are very common characters, this could lead to false
      positives, but we try to apply it only when there's strong evidence for
      mojibake. Disabling `restore_byte_a0` is safer from false positives,
      but creates false negatives.

    - `replace_lossy_sequences`: True

      Detect mojibake that has been partially replaced by the characters
      '�' or '?'. If the mojibake could be decoded otherwise, replace the
      detected sequence with '�'.

    - `decode_inconsistent_utf8`: True

      When we see sequences that distinctly look like UTF-8 mojibake, but
      there's no consistent way to reinterpret the string in a new encoding,
      replace the mojibake with the appropriate UTF-8 characters anyway.

      This helps to decode strings that are concatenated from different
      encodings.

    - `fix_c1_controls`: True

      Replace C1 control characters (the useless characters U+80 - U+9B that
      come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
      even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True

230 

231 

232def _config_from_kwargs( 

233 config: TextFixerConfig, kwargs: Dict[str, Any] 

234) -> TextFixerConfig: 

235 """ 

236 Handle parameters provided as keyword arguments to ftfy's top-level 

237 functions, converting them into a TextFixerConfig. 

238 """ 

239 if "fix_entities" in kwargs: 

240 warnings.warn( 

241 "`fix_entities` has been renamed to `unescape_html`", DeprecationWarning 

242 ) 

243 kwargs = kwargs.copy() 

244 kwargs["unescape_html"] = kwargs["fix_entities"] 

245 del kwargs["fix_entities"] 

246 config = config._replace(**kwargs) 

247 return config 

248 

249 

# Error message raised (inside a UnicodeError) when a bytes object is passed
# to a function that requires a str. The literal text below is user-facing;
# do not change it casually.
BYTES_ERROR_TEXT: str = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""

268 

269 

def _try_fix(
    fixer_name: str,
    text: str,
    config: TextFixerConfig,
    steps: Optional[List[ExplanationStep]],
) -> str:
    """
    Shared helper for the 'fixer' steps: run the named fixer only if its
    config flag is enabled, and record it in `steps` (when explanations are
    being collected) only if it actually changed the text.
    """
    # The fixer's name doubles as the name of its on/off flag in the config.
    if not getattr(config, fixer_name):
        return text

    result = cast(str, FIXERS[fixer_name](text))
    if steps is not None and result != text:
        steps.append(ExplanationStep("apply", fixer_name))
    return result

288 

289 

def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('✔ No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
        "Broken text... it's flubberific!"

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    """
    if config is None:
        # No explanation is returned from this function, so skip building one.
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    fixed_segments = []
    position = 0
    length = len(text)
    while position < length:
        # A segment normally runs through the next newline (inclusive), or to
        # the end of the text, but never longer than max_decode_length.
        newline_index = text.find("\n", position)
        segment_end = length if newline_index == -1 else newline_index + 1
        segment_end = min(segment_end, position + config.max_decode_length)

        segment = text[position:segment_end]
        # Once we see a literal '<', treat the input as HTML from here on and
        # stop unescaping entities (when unescape_html is "auto").
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        fixed_segments.append(fixed_segment)
        position = segment_end
    return "".join(fixed_segments)

362 

363 

def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning both the fixed text and an
    explanation of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    # A literal '<' suggests the input is actual HTML, so leave entities alone.
    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    # `steps` collects the explanation; it stays None when explanations
    # aren't wanted, which also lets later code skip explanation work.
    steps: Optional[List[ExplanationStep]] = [] if config.explain else None

    # Apply the whole pipeline repeatedly until the text reaches a fixpoint.
    while True:
        previous = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                text = fix_encoding(text)
            else:
                text, encoding_plan = fix_encoding_and_explain(text, config)
                if encoding_plan is not None:
                    steps.extend(encoding_plan)

        for fixer_name in (
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ):
            text = _try_fix(fixer_name, text, config, steps)

        if config.normalization is not None:
            normalized = unicodedata.normalize(config.normalization, text)
            if steps is not None and normalized != text:
                steps.append(ExplanationStep("normalize", config.normalization))
            text = normalized

        if text == previous:
            return ExplainedText(text, steps)

423 

def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("só")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilà le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    # Run single fixing steps until the text stops changing, accumulating
    # each step's plan along the way.
    accumulated_plan: List[ExplanationStep] = []
    while True:
        before = text
        text, step_plan = _fix_encoding_one_step_and_explain(text, config)
        if step_plan is not None:
            accumulated_plan.extend(step_plan)
        if text == before:
            return ExplainedText(text, accumulated_plan)

467 

def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.

    Returns the (possibly unchanged) text and the list of steps that were
    applied in this pass. Callers loop over this until the text is stable.
    """
    if config is None:
        config = TextFixerConfig()

    # Empty text has nothing to fix.
    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ExplanationStep("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "restore_byte_a0")
                        )
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "replace_lossy_sequences")
                        )
                        encoded_bytes = replaced_bytes

                # Bytes 0xed and 0xc0 signal CESU-8 or Java-style modified
                # UTF-8, which the "utf-8-variants" codec handles (along with
                # standard UTF-8).
                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ExplanationStep("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                # Not valid UTF-8 after all; move on to the next encoding.
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [ExplanationStep("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [
                        ExplanationStep("encode", "latin-1"),
                        ExplanationStep("decode", "windows-1252"),
                    ]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                # Some bytes are undefined in windows-1252; leave them be.
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [ExplanationStep("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])

579 

580 

def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    >>> fix_encoding("ó")
    'ó'
    >>> fix_encoding("&ATILDE;&SUP3;")
    '&ATILDE;&SUP3;'
    """
    if config is None:
        # The explanation is thrown away, so don't bother building it.
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    return fix_encoding_and_explain(text, config).text

596 

597 

598# Some alternate names for the main functions 

599ftfy = fix_text 

600 

601 

def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
    """
    if config is None:
        # The explanation is thrown away, so don't bother building it.
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    return fix_and_explain(text, config).text

612 

613 

def fix_file(
    input_file: TextIO | BinaryIO,
    encoding: Optional[str] = None,
    config: Optional[TextFixerConfig] = None,
    **kwargs,
) -> Iterator[str]:
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
    """
    if config is None:
        config = TextFixerConfig()
    config = _config_from_kwargs(config, kwargs)

    for raw_line in input_file:
        if isinstance(raw_line, bytes):
            if encoding is None:
                # Guess once, then reuse the guessed encoding for later lines.
                line, encoding = guess_bytes(raw_line)
            else:
                line = raw_line.decode(encoding)
        else:
            line = raw_line
        # A literal '<' suggests real HTML; stop unescaping entities.
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        fixed_line, _ = fix_and_explain(line, config)
        yield fixed_line

645 

646 

def guess_bytes(bstring: bytes) -> Tuple[str, str]:
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    # Either UTF-16 byte order mark identifies the text unambiguously.
    if bstring.startswith((b"\xfe\xff", b"\xff\xfe")):
        return bstring.decode("utf-16"), "utf-16"

    distinct_bytes = set(bstring)
    try:
        if distinct_bytes & {0xED, 0xC0}:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        pass

    if 0x0D in distinct_bytes and 0x0A not in distinct_bytes:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"

716 

717 

@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also can be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    # `value` alternates between str and bytes as the plan is applied.
    value = text
    for operation, argument in plan:
        if operation == "encode":
            value = value.encode(argument)
        elif operation == "decode":
            value = value.decode(argument)
        elif operation in ("transcode", "apply"):
            if argument not in FIXERS:
                raise ValueError("Unknown function to apply: %s" % argument)
            value = FIXERS[argument](value)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return value

758 

759 

def explain_unicode(text: str):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

    >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028  (       [Ps] LEFT PARENTHESIS
    U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0  °       [So] DEGREE SIGN
    U+25A1  □       [So] WHITE SQUARE
    U+00B0  °       [So] DEGREE SIGN
    U+0029  )       [Pe] RIGHT PARENTHESIS
    U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35  ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020          [Zs] SPACE
    U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501  ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        # Show non-printable characters via their backslash escapes.
        shown = (
            char
            if char.isprintable()
            else char.encode("unicode-escape").decode("ascii")
        )
        print(
            "U+{code:04X} {display} [{category}] {name}".format(
                display=display_ljust(shown, 7),
                code=ord(char),
                category=unicodedata.category(char),
                name=unicodedata.name(char, "<unknown>"),
            )
        )