# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/ftfy/__init__.py:
# 85% of 231 statements covered (coverage.py v7.2.7, created at 2023-06-07 06:40 +0000)

1"""
2ftfy: fixes text for you
4This is a module for making text less broken. See the `fix_text` function
5for more information.
6"""
8import unicodedata
9import warnings
10from typing import (
11 Any,
12 Dict,
13 Iterator,
14 List,
15 NamedTuple,
16 Optional,
17 TextIO,
18 Tuple,
19 Union,
20 cast,
21 no_type_check,
22)

from ftfy import bad_codecs
from ftfy import chardata, fixes
from ftfy.badness import is_bad
from ftfy.formatting import display_ljust

__version__ = "6.1.2"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
bad_codecs.ok()


class ExplanationStep(NamedTuple):
    """
    A step in an ExplainedText, explaining how to decode text.

    The possible actions are:

    - "encode": take in a string and encode it as bytes, with the given encoding
    - "decode": take in bytes and decode them as a string, with the given encoding
    - "transcode": convert bytes to bytes with a particular named function
    - "apply": convert str to str with a particular named function

    The `parameter` is the name of the encoding or function to use. If it's a
    function, it must appear in the FIXERS dictionary.
    """

    action: str
    parameter: str

    def __repr__(self) -> str:
        """
        Get the string representation of an ExplanationStep. We output the
        representation of the equivalent tuple, for simplicity.
        """
        return repr(tuple(self))
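
# A doctest-style sketch of what a step looks like in practice; the values
# here are purely illustrative:
#
#     >>> ExplanationStep("decode", "utf-8")
#     ('decode', 'utf-8')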


class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of the
    steps they applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    """

    text: str
    explanation: Optional[List[ExplanationStep]]
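
# When `explain` is disabled, the same type comes back with `explanation` set
# to None. A sketch, reusing the "schÃ¶n" mojibake example from the doctests
# further down in this module:
#
#     >>> fix_and_explain("schÃ¶n", TextFixerConfig(explain=False))
#     ExplainedText(text='schön', explanation=None)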


# Functions that can be applied using `apply_plan`.
FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}
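
# The keys above are the same names that appear as the `parameter` of an
# ExplanationStep, which is what lets `apply_plan` replay a step by name.
# An illustrative lookup:
#
#     >>> FIXERS["uncurl_quotes"]("it’s")
#     "it's"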


class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

      The following four options affect how `fix_encoding` works, and do nothing if
      `fix_encoding` is False:

    - `restore_byte_a0`: True

      Allow a literal space (U+20) to be interpreted as a non-breaking space
      (U+A0) when that would make it part of a fixable mojibake string.

      Because spaces are very common characters, this could lead to false
      positives, but we try to apply it only when there's strong evidence for
      mojibake. Disabling `restore_byte_a0` is safer from false positives,
      but creates false negatives.

    - `replace_lossy_sequences`: True

      Detect mojibake that has been partially replaced by the characters
      '�' or '?'. If the mojibake could be decoded otherwise, replace the
      detected sequence with '�'.

    - `decode_inconsistent_utf8`: True

      When we see sequences that distinctly look like UTF-8 mojibake, but
      there's no consistent way to reinterpret the string in a new encoding,
      replace the mojibake with the appropriate UTF-8 characters anyway.

      This helps to decode strings that are concatenated from different
      encodings.

    - `fix_c1_controls`: True

      Replace C1 control characters (the useless characters U+80 - U+9B that
      come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
      even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True
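
# Because TextFixerConfig is a NamedTuple, the standard `_replace` method
# returns a modified copy; ftfy uses this internally, and callers can use it
# too. An illustrative sketch:
#
#     >>> config = TextFixerConfig(explain=False)
#     >>> config = config._replace(uncurl_quotes=False)
#     >>> config.uncurl_quotes
#     False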


def _config_from_kwargs(
    config: TextFixerConfig, kwargs: Dict[str, Any]
) -> TextFixerConfig:
    """
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.
    """
    if "fix_entities" in kwargs:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`", DeprecationWarning
        )
        kwargs = kwargs.copy()
        kwargs["unescape_html"] = kwargs["fix_entities"]
        del kwargs["fix_entities"]
    config = config._replace(**kwargs)
    return config
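
# A sketch of the deprecation shim above: a legacy `fix_entities` keyword is
# carried over to `unescape_html`, alongside a DeprecationWarning:
#
#     >>> config = _config_from_kwargs(TextFixerConfig(), {"fix_entities": False})
#     >>> config.unescape_html
#     False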


BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str,
    text: str,
    config: TextFixerConfig,
    steps: Optional[List[ExplanationStep]],
) -> str:
    """
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.
    """
    if getattr(config, fixer_name):
        fixer = FIXERS[fixer_name]
        fixed = fixer(text)
        if steps is not None and fixed != text:
            steps.append(ExplanationStep("apply", fixer_name))
        return cast(str, fixed)

    return text
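
# An illustrative call with the default configuration: the fixer runs, and
# because it changed the text, the step is recorded.
#
#     >>> steps = []
#     >>> _try_fix("uncurl_quotes", "it’s", TextFixerConfig(), steps)
#     "it's"
#     >>> steps
#     [('apply', 'uncurl_quotes')]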


def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
        "Broken text... it's flubberific!"

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)


def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[ExplanationStep]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                text = fix_encoding(text)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                if encoding_steps is not None:
                    steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(ExplanationStep("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)
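
# A sketch tying the explanation back to `apply_plan` (see its doctest below),
# assuming the classic "schÃ¶n" mojibake example:
#
#     >>> fixed, plan = fix_and_explain("schÃ¶n")
#     >>> fixed
#     'schön'
#     >>> apply_plan("schÃ¼ss", plan)
#     'schüss'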


def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    plan_so_far: List[ExplanationStep] = []
    while True:
        prevtext = text
        text, plan = _fix_encoding_one_step_and_explain(text, config)
        if plan is not None:
            plan_so_far.extend(plan)
        if text == prevtext:
            return ExplainedText(text, plan_so_far)


def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ExplanationStep("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "restore_byte_a0")
                        )
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "replace_lossy_sequences")
                        )
                        encoded_bytes = replaced_bytes

                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ExplanationStep("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [ExplanationStep("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [
                        ExplanationStep("encode", "latin-1"),
                        ExplanationStep("decode", "windows-1252"),
                    ]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [ExplanationStep("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])


def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    >>> fix_encoding("Ã³")
    'ó'
    >>> fix_encoding("&ATILDE;&SUP3;")
    '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_encoding_and_explain(text, config)
    return fixed


# Some alternate names for the main functions
ftfy = fix_text


def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_and_explain(text, config)
    return fixed
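
# Unlike fix_text, this does not split the input into segments at line breaks;
# the whole string is fixed in one pass. An illustrative call on a small
# mojibake input:
#
#     >>> fix_text_segment("uÌˆnicode")
#     'ünicode'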


def fix_file(
    input_file: TextIO,
    encoding: Optional[str] = None,
    config: Optional[TextFixerConfig] = None,
    **kwargs
) -> Iterator[str]:
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
    """
    if config is None:
        config = TextFixerConfig()
    config = _config_from_kwargs(config, kwargs)

    for line in input_file:
        if isinstance(line, bytes):
            if encoding is None:
                line, encoding = guess_bytes(line)
            else:
                line = line.decode(encoding)
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        fixed_line, _explan = fix_and_explain(line, config)
        yield fixed_line
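
# A usage sketch, assuming a bytes-mode file whose encoding ftfy must guess;
# "example.txt" is a hypothetical path:
#
#     with open("example.txt", "rb") as f:
#         for line in fix_file(f):
#             print(line, end="")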


def guess_bytes(bstring: bytes) -> Tuple[str, str]:
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)
    try:
        if 0xED in byteset or 0xC0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        else:
            return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
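
# Two illustrative guesses, following the priority order described above
# (the byte strings are made up for the example):
#
#     >>> guess_bytes(b"\xe2\x9c\x94 OK")
#     ('✔ OK', 'utf-8')
#     >>> guess_bytes(b"voil\x88 le travail\r")
#     ('voilà le travail\r', 'macroman')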


@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, encoding in plan:
        if operation == "encode":
            obj = obj.encode(encoding)
        elif operation == "decode":
            obj = obj.decode(encoding)
        elif operation in ("transcode", "apply"):
            if encoding in FIXERS:
                obj = FIXERS[encoding](obj)
            else:
                raise ValueError("Unknown function to apply: %s" % encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj


def explain_unicode(text: str):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

    >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028 ( [Ps] LEFT PARENTHESIS
    U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0 ° [So] DEGREE SIGN
    U+25A1 □ [So] WHITE SQUARE
    U+00B0 ° [So] DEGREE SIGN
    U+0029 ) [Pe] RIGHT PARENTHESIS
    U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020   [Zs] SPACE
    U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if char.isprintable():
            display = char
        else:
            display = char.encode("unicode-escape").decode("ascii")
        print(
            "U+{code:04X} {display} [{category}] {name}".format(
                display=display_ljust(display, 7),
                code=ord(char),
                category=unicodedata.category(char),
                name=unicodedata.name(char, "<unknown>"),
            )
        )