1"""
2The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text`
3can perform, and provides the functions that are named in "explanations"
4such as the output of :func:`ftfy.fix_and_explain`.
6Two of these functions are particularly useful on their own, as more robust
7versions of functions in the Python standard library:
9- :func:`ftfy.fixes.decode_escapes`
10- :func:`ftfy.fixes.unescape_html`
11"""
13import codecs
14import html
15import re
16import warnings
17from re import Match
18from typing import Any
20import ftfy
21from ftfy.badness import is_bad
22from ftfy.chardata import (
23 ALTERED_UTF8_RE,
24 C1_CONTROL_RE,
25 CONTROL_CHARS,
26 DOUBLE_QUOTE_RE,
27 HTML_ENTITIES,
28 HTML_ENTITY_RE,
29 LIGATURES,
30 LOSSY_UTF8_RE,
31 SINGLE_QUOTE_RE,
32 UTF8_DETECTOR_RE,
33 WIDTH_MAP,
34)


def fix_encoding_and_explain(text: str) -> Any:
    """
    Deprecated copy of `ftfy.fix_encoding_and_explain()`.
    """
    warnings.warn(
        "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.fix_encoding_and_explain(text)


def fix_encoding(text: str) -> str:
    """
    Deprecated copy of `ftfy.fix_encoding()`.
    """
    warnings.warn(
        "`fix_encoding()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.fix_encoding(text)


def apply_plan(text: str, plan: list[tuple[str, str]]) -> str:
    """
    Deprecated copy of `ftfy.apply_plan()`.
    """
    warnings.warn(
        "`apply_plan()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.apply_plan(text, plan)


def _unescape_fixup(match: Match[str]) -> str:
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text in HTML_ENTITIES:
        return HTML_ENTITIES[text]
    elif text.startswith("&#"):
        unescaped: str = html.unescape(text)

        # If html.unescape only decoded part of the string, that's not what
        # we want. The semicolon should be consumed.
        if ";" in unescaped:
            return text
        else:
            return unescaped
    else:
        return text


def unescape_html(text: str) -> str:
    """
    Decode HTML entities and character references, including some nonstandard
    ones written in all-caps.

    Python has a built-in called `html.unescape` that can decode HTML escapes,
    including a bunch of messy edge cases such as decoding escapes without
    semicolons such as "&amp".

    If you know you've got HTML-escaped text, applying `html.unescape` is the
    right way to convert it to plain text. But in ambiguous situations, that
    would create false positives. For example, the informally written text
    "this&not that" should not automatically be decoded as "this¬ that".

    In this function, we decode the escape sequences that appear in the
    `html.entities.html5` dictionary, as long as they are the unambiguous ones
    that end in semicolons.

    We also decode all-caps versions of Latin letters and common symbols.
    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
    entities, because there are many instances where entity names are
    case-sensitive in complicated ways.

    >>> unescape_html('&lt;tag&gt;')
    '<tag>'

    >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
    '𝒥ohn ℋancock'

    >>> unescape_html('&checkmark;')
    '✓'

    >>> unescape_html('P&eacute;rez')
    'Pérez'

    >>> unescape_html('P&EACUTE;REZ')
    'PÉREZ'

    >>> unescape_html('BUNDESSTRA&SZLIG;E')
    'BUNDESSTRASSE'

    >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
    'ñ Ñ Ñ &nTILDE;'
    """
    return HTML_ENTITY_RE.sub(_unescape_fixup, text)


ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")


def remove_terminal_escapes(text: str) -> str:
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub("", text)


def uncurl_quotes(text: str) -> str:
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))


def fix_latin_ligatures(text: str) -> str:
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)


def fix_character_width(text: str) -> str:
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Ｕターン"))   # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)


def fix_line_breaks(text: str) -> str:
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly not what you
      meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return (
        text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\u2028", "\n")
        .replace("\u2029", "\n")
        .replace("\u0085", "\n")
    )


SURROGATE_RE = re.compile("[\ud800-\udfff]")
SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def convert_surrogate_pair(match: Match[str]) -> str:
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
    return chr(codept)
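
# Worked example of the formula above: the surrogate pair U+D83D, U+DCA9
# (the one used in the `fix_surrogates` doctest below) maps to
#   0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDCA9 - 0xDC00)
#   = 0x10000 + 0x3D * 0x400 + 0xA9
#   = 0x1F4A9
# which is U+1F4A9, PILE OF POO (💩).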


def fix_surrogates(text: str) -> str:
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = chr(0xd83d)
    >>> low_surrogate = chr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub("\ufffd", text)
    return text


def remove_control_chars(text: str) -> str:
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    - Musical notation control characters (U+1D173 to U+1D17A) because wow if
      you're using those you probably have a good reason
    - Tag characters, because they are now used in emoji sequences such as
      "Flag of Wales"
    """
    return text.translate(CONTROL_CHARS)
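
# For instance, going by the lists above, remove_control_chars("A\x00B\ufeffC\tD")
# should return "ABC\tD": the NUL and the byte order mark are dropped, while
# the tab survives. (Illustrative example, not one of this module's doctests.)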


def remove_bom(text: str) -> str:
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(chr(0xFEFF))


# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)


def decode_escapes(text: str) -> str:
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This function has to be called specifically. It's not run automatically by
    ftfy, because escaped text is not necessarily a mistake, and there is no
    way to distinguish when it is.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.
    """

    def decode_match(match: Match[str]) -> str:
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), "unicode-escape")

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)


# This regex implements an exception to restore_byte_a0, so we can decode the
# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
# mode".
#
# If byte C3 appears with a single space after it -- most commonly this shows
# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
# the space. Without this change, we would decode "à" as the start of the next
# word, such as "àla". It's almost always intended to be a separate word, as in
# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
# get coalesced into "Ã la".
#
# We make exceptions for the Portuguese words "às", "àquele", "àquela",
# "àquilo" and their plurals -- these are contractions of, for example, "a
# aquele" and are very common. Note that the final letter is important to
# distinguish this case from French "à quel point".
#
# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
# to be "África" with the accent in the other direction).
#
# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
# contain it will end up with inserted spaces. We can't do the right thing with
# every word. The cost is that the mojibake text "fÃ cil" will be interpreted as
# "fà cil", not "fàcil".
A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")


def restore_byte_a0(byts: bytes) -> bytes:
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)

    def replacement(match: Match[bytes]) -> bytes:
        "The function to apply when this regex matches."
        return match.group(0).replace(b"\x20", b"\xa0")

    return ALTERED_UTF8_RE.sub(replacement, byts)
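
# Illustrative example (assuming ALTERED_UTF8_RE, defined in ftfy.chardata,
# behaves as described above): given the mojibake bytes b"\xc3 la mode", the
# A_GRAVE_WORD_RE exception turns them into b"\xc3\xa0 la mode", so a later
# decoding step can read them as "à la mode" rather than "àla mode".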


def replace_lossy_sequences(byts: bytes) -> bytes:
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a transcoder within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub("\ufffd".encode(), byts)
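
# Concretely (an illustration, not taken from the library's own tests): the
# curly quote “ is UTF-8 bytes E2 80 9C. If the last byte was lost and later
# re-encoded by a sloppy codec as byte 1A, the input here contains
# b"\xe2\x80\x1a", and LOSSY_UTF8_RE should replace that whole run with
# b"\xef\xbf\xbd", the UTF-8 encoding of U+FFFD.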


def decode_inconsistent_utf8(text: str) -> str:
    """
    Sometimes, text from one encoding ends up embedded within text from a
    different one. This is common enough that we need to be able to fix it.

    This is used as a transcoder within `fix_encoding`.
    """

    def fix_embedded_mojibake(match: Match[str]) -> str:
        substr = match.group(0)

        # Require the match to be shorter, so that this doesn't recurse infinitely
        if len(substr) < len(text) and is_bad(substr):
            return ftfy.fix_encoding(substr)
        else:
            return substr

    return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)
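
# Rough example of the situation this targets (hedged; the exact behavior
# depends on UTF8_DETECTOR_RE and is_bad): a string such as
# "already-correct “quotes” next to the mojibake rÃ©sumÃ©" contains an embedded
# UTF-8-as-Latin-1 fragment. Only the fragment that the detector matches and
# is_bad flags is re-decoded via ftfy.fix_encoding; the correct part is left
# untouched.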


def _c1_fixer(match: Match[str]) -> str:
    return match.group(0).encode("latin-1").decode("sloppy-windows-1252")


def fix_c1_controls(text: str) -> str:
    """
    If text still contains C1 control characters, treat them as their
    Windows-1252 equivalents. This matches what Web browsers do.
    """
    return C1_CONTROL_RE.sub(_c1_fixer, text)
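
# For example (going by the Windows-1252 mapping; not one of this module's
# doctests): the C1 characters U+0093 and U+0094 correspond to “ and ” in
# Windows-1252, so fix_c1_controls("\x93quoted\x94") should return "“quoted”",
# and fix_c1_controls("wait\x85") should return "wait…".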