1"""
2The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text`
3can perform, and provides the functions that are named in "explanations"
4such as the output of :func:`ftfy.fix_and_explain`.
6Two of these functions are particularly useful on their own, as more robust
7versions of functions in the Python standard library:
9- :func:`ftfy.fixes.decode_escapes`
10- :func:`ftfy.fixes.unescape_html`
11"""

import codecs
import html
import re
import warnings

import ftfy
from ftfy.chardata import (
    ALTERED_UTF8_RE,
    C1_CONTROL_RE,
    CONTROL_CHARS,
    DOUBLE_QUOTE_RE,
    HTML_ENTITIES,
    HTML_ENTITY_RE,
    LIGATURES,
    LOSSY_UTF8_RE,
    SINGLE_QUOTE_RE,
    UTF8_DETECTOR_RE,
    WIDTH_MAP,
)

from ftfy.badness import is_bad


def fix_encoding_and_explain(text):
    """
    Deprecated copy of `ftfy.fix_encoding_and_explain()`.
    """
    warnings.warn(
        "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
        DeprecationWarning,
    )
    return ftfy.fix_encoding_and_explain(text)


def fix_encoding(text):
    """
    Deprecated copy of `ftfy.fix_encoding()`.
    """
    warnings.warn(
        "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.fix_encoding(text)


def apply_plan(text, plan):
    """
    Deprecated copy of `ftfy.apply_plan()`.
    """
    warnings.warn(
        "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.apply_plan(text, plan)


def _unescape_fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text in HTML_ENTITIES:
        return HTML_ENTITIES[text]
    elif text.startswith("&#"):
        unescaped = html.unescape(text)

        # If html.unescape only decoded part of the string, that's not what
        # we want. The semicolon should be consumed.
        if ";" in unescaped:
            return text
        else:
            return unescaped
    else:
        return text
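
# An illustrative note (added; not part of the original source): with a match
# like "&#8364;", html.unescape produces "€", which contains no semicolon, so
# the decoded form is returned. With a reference html.unescape cannot decode,
# such as "&#xq;" (assuming HTML_ENTITY_RE can hand this function such a
# span), the text comes back unchanged; the leftover ";" signals that nothing
# was fully decoded, so the original match is returned as-is.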


def unescape_html(text):
    """
    Decode HTML entities and character references, including some nonstandard
    ones written in all-caps.

    Python has a built-in called `html.unescape` that can decode HTML escapes,
    including a bunch of messy edge cases such as decoding escapes without
    semicolons such as "&amp".

    If you know you've got HTML-escaped text, applying `html.unescape` is the
    right way to convert it to plain text. But in ambiguous situations, that
    would create false positives. For example, the informally written text
    "this&not that" should not automatically be decoded as "this¬ that".

    In this function, we decode the escape sequences that appear in the
    `html.entities.html5` dictionary, as long as they are the unambiguous ones
    that end in semicolons.

    We also decode all-caps versions of Latin letters and common symbols.
    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
    entities, because there are many instances where entity names are
    case-sensitive in complicated ways.

    >>> unescape_html('&lt;tag&gt;')
    '<tag>'

    >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
    '𝒥ohn ℋancock'

    >>> unescape_html('&checkmark;')
    '✓'

    >>> unescape_html('P&eacute;rez')
    'Pérez'

    >>> unescape_html('P&EACUTE;REZ')
    'PÉREZ'

    >>> unescape_html('BUNDESSTRA&SZLIG;E')
    'BUNDESSTRASSE'

    >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
    'ñ Ñ Ñ &nTILDE;'
    """
    return HTML_ENTITY_RE.sub(_unescape_fixup, text)


ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")


def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub("", text)


def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))


def fix_latin_ligatures(text):
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)


def fix_character_width(text):
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Ｕﾀｰﾝ"))   # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)


def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly not what you
      meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return (
        text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\u2028", "\n")
        .replace("\u2029", "\n")
        .replace("\u0085", "\n")
    )


SURROGATE_RE = re.compile("[\ud800-\udfff]")
SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def convert_surrogate_pair(match):
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
    return chr(codept)
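
# A worked example of the formula above (added for illustration): the pair
# U+D83D U+DCA9 encodes U+1F4A9, PILE OF POO.
#
#       0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDCA9 - 0xDC00)
#     = 0x10000 + 0x3D * 0x400 + 0xA9
#     = 0x10000 + 0xF400 + 0xA9
#     = 0x1F4A9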


def fix_surrogates(text):
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = chr(0xd83d)
    >>> low_surrogate = chr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub("\ufffd", text)
    return text


def remove_control_chars(text):
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    - Musical notation control characters (U+1D173 to U+1D17A) because wow if
      you're using those you probably have a good reason
    - Tag characters, because they are now used in emoji sequences such as
      "Flag of Wales"
    """
    return text.translate(CONTROL_CHARS)
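
# A hedged usage sketch (added; not a doctest from the original source),
# assuming CONTROL_CHARS maps the characters listed above to None:
#
#     >>> remove_control_chars("1.\tA\x00list\ufeff")
#     '1.\tAlist'
#
# The tab (U+09) survives because it produces whitespace; the NUL and the
# byte order mark are dropped.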


def remove_bom(text):
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(chr(0xFEFF))


# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This function has to be called specifically. It's not run automatically by
    ftfy, because escaped text is not necessarily a mistake, and there is no
    way to distinguish when it is.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.
    """

    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), "unicode-escape")

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)


# This regex implements an exception to restore_byte_a0, so we can decode the
# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
# mode".
#
# If byte C3 appears with a single space after it -- most commonly this shows
# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
# the space. Without this change, we would decode "à" as the start of the next
# word, such as "àla". It's almost always intended to be a separate word, as in
# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
# get coalesced into "Ã la".
#
# We make exceptions for the Portuguese words "às", "àquele", "àquela",
# "àquilo" and their plurals -- these are contractions of, for example, "a
# aquele" and are very common. Note that the final letter is important to
# distinguish this case from French "à quel point".
#
# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
# to be "África" with the accent in the other direction).
#
# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
# contain it will end up with inserted spaces. We can't do the right thing with
# every word. The cost is that the mojibake text "fÃ cil" will be interpreted as
# "fà cil", not "fàcil".
A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")


def restore_byte_a0(byts):
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)

    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b"\x20", b"\xa0")

    return ALTERED_UTF8_RE.sub(replacement, byts)
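
# A hedged example (added for illustration): "à" encodes in UTF-8 as C3 A0,
# so the mojibake "fÃ cil" arrives as the bytes b"f\xc3 cil" once A0 has been
# flattened to a space. A_GRAVE_WORD_RE, defined above, matches the C3-plus-
# space and restores the A0 (assuming ALTERED_UTF8_RE makes no further
# changes here):
#
#     >>> restore_byte_a0(b"f\xc3 cil")
#     b'f\xc3\xa0 cil'
#
# which then decodes as "fà cil", the trade-off described in the comment
# above A_GRAVE_WORD_RE.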


def replace_lossy_sequences(byts):
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a transcoder within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts)
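
# A hedged example (added; byte values chosen for illustration): the curly
# quote ” is UTF-8 E2 80 9D, and byte 9D is unassigned in windows-1252, so a
# lossy decode loses it. Once the sloppy codec re-encodes the � as byte 1A,
# the whole truncated sequence becomes the UTF-8 encoding of U+FFFD, assuming
# LOSSY_UTF8_RE matches such sequences:
#
#     >>> replace_lossy_sequences(b"\xe2\x80\x1a ok")
#     b'\xef\xbf\xbd ok'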


def decode_inconsistent_utf8(text):
    """
    Sometimes, text from one encoding ends up embedded within text from a
    different one. This is common enough that we need to be able to fix it.

    This is used as a transcoder within `fix_encoding`.
    """

    def fix_embedded_mojibake(match):
        substr = match.group(0)

        # Require the match to be shorter, so that this doesn't recurse infinitely
        if len(substr) < len(text) and is_bad(substr):
            return ftfy.fix_encoding(substr)
        else:
            return substr

    return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)
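
# A hedged usage sketch (added; the input is a typical example, not from the
# original source), assuming UTF8_DETECTOR_RE flags "â‚¬", the windows-1252
# misreading of the UTF-8 bytes E2 82 AC for "€":
#
#     >>> decode_inconsistent_utf8("The price is â‚¬50")
#     'The price is €50'
#
# Only the flagged substring is re-decoded; the surrounding text is left
# untouched.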


def _c1_fixer(match):
    return match.group(0).encode("latin-1").decode("sloppy-windows-1252")


def fix_c1_controls(text):
    """
    If text still contains C1 control characters, treat them as their
    Windows-1252 equivalents. This matches what Web browsers do.
    """
    return C1_CONTROL_RE.sub(_c1_fixer, text)
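
# A hedged example (added for illustration): in windows-1252, byte 85 is the
# horizontal ellipsis, so the stray C1 control U+0085 becomes "…", assuming
# C1_CONTROL_RE matches the range U+0080 to U+009F:
#
#     >>> fix_c1_controls("to be continued\x85")
#     'to be continued…'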