1""" 

2The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` 

3can perform, and provides the functions that are named in "explanations" 

4such as the output of :func:`ftfy.fix_and_explain`. 

5 

6Two of these functions are particularly useful on their own, as more robust 

7versions of functions in the Python standard library: 

8 

9- :func:`ftfy.fixes.decode_escapes` 

10- :func:`ftfy.fixes.unescape_html` 

11""" 

12 

13import codecs 

14import html 

15import re 

16import warnings 

17 

18import ftfy 

19from ftfy.chardata import ( 

20 ALTERED_UTF8_RE, 

21 C1_CONTROL_RE, 

22 CONTROL_CHARS, 

23 DOUBLE_QUOTE_RE, 

24 HTML_ENTITIES, 

25 HTML_ENTITY_RE, 

26 LIGATURES, 

27 LOSSY_UTF8_RE, 

28 SINGLE_QUOTE_RE, 

29 UTF8_DETECTOR_RE, 

30 WIDTH_MAP, 

31) 

32 

33from ftfy.badness import is_bad 

34 

35 

def fix_encoding_and_explain(text):
    """
    Deprecated copy of `ftfy.fix_encoding_and_explain()`.
    """
    warnings.warn(
        "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
        DeprecationWarning,
    )
    return ftfy.fix_encoding_and_explain(text)


def fix_encoding(text):
    """
    Deprecated copy of `ftfy.fix_encoding()`.
    """
    warnings.warn(
        "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.fix_encoding(text)


def apply_plan(text, plan):
    """
    Deprecated copy of `ftfy.apply_plan()`.
    """
    warnings.warn(
        "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.apply_plan(text, plan)


def _unescape_fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text in HTML_ENTITIES:
        return HTML_ENTITIES[text]
    elif text.startswith("&#"):
        unescaped = html.unescape(text)

        # If html.unescape only decoded part of the string, that's not what
        # we want. The semicolon should be consumed.
        if ";" in unescaped:
            return text
        else:
            return unescaped
    else:
        return text


def unescape_html(text):
    """
    Decode HTML entities and character references, including some nonstandard
    ones written in all-caps.

    Python has a built-in called `html.unescape` that can decode HTML escapes,
    including a bunch of messy edge cases such as decoding escapes without
    semicolons such as "&amp".

    If you know you've got HTML-escaped text, applying `html.unescape` is the
    right way to convert it to plain text. But in ambiguous situations, that
    would create false positives. For example, the informally written text
    "this&not that" should not automatically be decoded as "this¬ that".

    In this function, we decode the escape sequences that appear in the
    `html.entities.html5` dictionary, as long as they are the unambiguous ones
    that end in semicolons.

    We also decode all-caps versions of Latin letters and common symbols.
    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
    entities, because there are many instances where entity names are
    case-sensitive in complicated ways.

    >>> unescape_html('&lt;tag&gt;')
    '<tag>'

    >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
    '𝒥ohn ℋancock'

    >>> unescape_html('&checkmark;')
    '✓'

    >>> unescape_html('P&eacute;rez')
    'Pérez'

    >>> unescape_html('P&EACUTE;REZ')
    'PÉREZ'

    >>> unescape_html('BUNDESSTRA&SZLIG;E')
    'BUNDESSTRASSE'

    >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
    'ñ Ñ Ñ &nTILDE;'
    """
    return HTML_ENTITY_RE.sub(_unescape_fixup, text)


ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")


def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub("", text)


def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))


def fix_latin_ligatures(text):
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)


def fix_character_width(text):
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Ｕターン"))  # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)


def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly not what you
      meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return (
        text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\u2028", "\n")
        .replace("\u2029", "\n")
        .replace("\u0085", "\n")
    )


SURROGATE_RE = re.compile("[\ud800-\udfff]")
SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def convert_surrogate_pair(match):
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
    return chr(codept)

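# As a concrete check of the formula above, the surrogate pair U+D83D U+DCA9
# (the same pair used in the `fix_surrogates` doctest below) works out to:
#     0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDCA9 - 0xDC00) == 0x1F4A9
# which is U+1F4A9, the "pile of poo" emoji.
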

def fix_surrogates(text):
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = chr(0xd83d)
    >>> low_surrogate = chr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub("\ufffd", text)
    return text


def remove_control_chars(text):
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    - Musical notation control characters (U+1D173 to U+1D17A) because wow if
      you're using those you probably have a good reason
    - Tag characters, because they are now used in emoji sequences such as
      "Flag of Wales"
    """
    return text.translate(CONTROL_CHARS)

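# A small illustration of the keep/remove rules listed above: a NUL is dropped
# while an ordinary tab survives, so remove_control_chars("tab\tand\x00null")
# should come back as "tab\tandnull" (U+09 is on the whitespace keep-list,
# U+00 is not).
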

def remove_bom(text):
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(chr(0xFEFF))


# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This function has to be called specifically. It's not run automatically by
    ftfy, because escaped text is not necessarily a mistake, and there is no
    way to distinguish when it is.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.
    """

    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), "unicode-escape")

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)


# This regex implements an exception to restore_byte_a0, so we can decode the
# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
# mode".
#
# If byte C3 appears with a single space after it -- most commonly this shows
# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
# the space. Without this change, we would decode "à" as the start of the next
# word, such as "àla". It's almost always intended to be a separate word, as in
# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
# get coalesced into "Ã la".
#
# We make exceptions for the Portuguese words "às", "àquele", "àquela",
# "àquilo" and their plurals -- these are contractions of, for example, "a
# aquele" and are very common. Note that the final letter is important to
# distinguish this case from French "à quel point".
#
# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
# to be "África" with the accent in the other direction).
#
# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
# contain it will end up with inserted spaces. We can't do the right thing with
# every word. The cost is that the mojibake text "fÃ cil" will be interpreted as
# "fà cil", not "fàcil".
A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")


def restore_byte_a0(byts):
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)

    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b"\x20", b"\xa0")

    return ALTERED_UTF8_RE.sub(replacement, byts)

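# To illustrate the "Ã la mode" case from the comment block above: the bytes
# b"\xc3 la mode" match A_GRAVE_WORD_RE, so restore_byte_a0 should turn them
# into b"\xc3\xa0 la mode", which decodes from UTF-8 as "à la mode" with the
# word break preserved.
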

def replace_lossy_sequences(byts):
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a transcoder within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts)


def decode_inconsistent_utf8(text):
    """
    Sometimes, text from one encoding ends up embedded within text from a
    different one. This is common enough that we need to be able to fix it.

    This is used as a transcoder within `fix_encoding`.
    """

    def fix_embedded_mojibake(match):
        substr = match.group(0)

        # Require the match to be shorter, so that this doesn't recurse infinitely
        if len(substr) < len(text) and is_bad(substr):
            return ftfy.fix_encoding(substr)
        else:
            return substr

    return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)

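# The typical input here is mostly-correct text with a short UTF-8-as-Latin-1
# burst embedded in it, such as "IKEAâ€™s couch": the "â€™" run is the kind of
# sequence UTF8_DETECTOR_RE looks for, and if is_bad flags it, fix_encoding
# turns it back into "’" while the surrounding text is left untouched.
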

def _c1_fixer(match):
    return match.group(0).encode("latin-1").decode("sloppy-windows-1252")


def fix_c1_controls(text):
    """
    If text still contains C1 control characters, treat them as their
    Windows-1252 equivalents. This matches what Web browsers do.
    """
    return C1_CONTROL_RE.sub(_c1_fixer, text)
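

# A small sketch of the Windows-1252 reinterpretation described above: the C1
# codepoints U+0093 and U+0094 occupy the slots Windows-1252 uses for curly
# quotes, so fix_c1_controls("\x93Hello\x94") should come back as
# "\u201cHello\u201d" ("Hello" in curly quotes).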