1""" 

2The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` 

3can perform, and provides the functions that are named in "explanations" 

4such as the output of :func:`ftfy.fix_and_explain`. 

5 

6Two of these functions are particularly useful on their own, as more robust 

7versions of functions in the Python standard library: 

8 

9- :func:`ftfy.fixes.decode_escapes` 

10- :func:`ftfy.fixes.unescape_html` 

11""" 

12 

13import codecs 

14import html 

15import re 

16import warnings 

17 

18import ftfy 

19from ftfy.chardata import ( 

20 ALTERED_UTF8_RE, 

21 C1_CONTROL_RE, 

22 CONTROL_CHARS, 

23 DOUBLE_QUOTE_RE, 

24 HTML_ENTITIES, 

25 HTML_ENTITY_RE, 

26 LIGATURES, 

27 LOSSY_UTF8_RE, 

28 SINGLE_QUOTE_RE, 

29 UTF8_DETECTOR_RE, 

30 WIDTH_MAP, 

31) 

32 

33from ftfy.badness import is_bad 

34 

35 

def fix_encoding_and_explain(text):
    """
    Deprecated copy of `ftfy.fix_encoding_and_explain()`.
    """
    warnings.warn(
        "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
        DeprecationWarning,
    )
    return ftfy.fix_encoding_and_explain(text)


def fix_encoding(text):
    """
    Deprecated copy of `ftfy.fix_encoding()`.
    """
    warnings.warn(
        "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.fix_encoding(text)


def apply_plan(text, plan):
    """
    Deprecated copy of `ftfy.apply_plan()`.
    """
    warnings.warn(
        "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.apply_plan(text, plan)


def _unescape_fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text in HTML_ENTITIES:
        return HTML_ENTITIES[text]
    elif text.startswith("&#"):
        unescaped = html.unescape(text)

        # If html.unescape only decoded part of the string, that's not what
        # we want. The semicolon should be consumed.
        if ";" in unescaped:
            return text
        else:
            return unescaped
    else:
        return text


def unescape_html(text):
    """
    Decode HTML entities and character references, including some nonstandard
    ones written in all-caps.

    Python has a built-in called `html.unescape` that can decode HTML escapes,
    including a bunch of messy edge cases such as decoding escapes without
    semicolons such as "&amp".

    If you know you've got HTML-escaped text, applying `html.unescape` is the
    right way to convert it to plain text. But in ambiguous situations, that
    would create false positives. For example, the informally written text
    "this&not that" should not automatically be decoded as "this¬ that".

    In this function, we decode the escape sequences that appear in the
    `html.entities.html5` dictionary, as long as they are the unambiguous ones
    that end in semicolons.

    We also decode all-caps versions of Latin letters and common symbols.
    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
    entities, because there are many instances where entity names are
    case-sensitive in complicated ways.

    >>> unescape_html('&lt;tag&gt;')
    '<tag>'

    >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
    '𝒥ohn ℋancock'

    >>> unescape_html('&checkmark;')
    '✓'

    >>> unescape_html('P&eacute;rez')
    'Pérez'

    >>> unescape_html('P&EACUTE;REZ')
    'PÉREZ'

    >>> unescape_html('BUNDESSTRA&SZLIG;E')
    'BUNDESSTRASSE'

    >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
    'ñ Ñ Ñ &nTILDE;'
    """
    return HTML_ENTITY_RE.sub(_unescape_fixup, text)


ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")


def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub("", text)


def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))


def fix_latin_ligatures(text):
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)


def fix_character_width(text):
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Ｕターン"))  # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)


def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly not what you
      meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return (
        text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\u2028", "\n")
        .replace("\u2029", "\n")
        .replace("\u0085", "\n")
    )


SURROGATE_RE = re.compile("[\ud800-\udfff]")
SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def convert_surrogate_pair(match):
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
    return chr(codept)

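# As a concrete check of the formula above, the surrogate pair U+D83D U+DCA9
# (the same pair used in the `fix_surrogates` doctest below) works out to:
#     0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDCA9 - 0xDC00) == 0x1F4A9
# which is U+1F4A9, the "pile of poo" emoji.
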

def fix_surrogates(text):
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = chr(0xd83d)
    >>> low_surrogate = chr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub("\ufffd", text)
    return text


def remove_control_chars(text):
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    - Musical notation control characters (U+1D173 to U+1D17A) because wow if
      you're using those you probably have a good reason
    - Tag characters, because they are now used in emoji sequences such as
      "Flag of Wales"
    """
    return text.translate(CONTROL_CHARS)

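# A small illustration of the keep/remove rules listed above: a NUL is dropped
# while an ordinary tab survives, so remove_control_chars("tab\tand\x00null")
# should come back as "tab\tandnull" (U+09 is on the whitespace keep-list,
# U+00 is not).
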

def remove_bom(text):
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(chr(0xFEFF))


# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This function has to be called specifically. It's not run automatically by
    ftfy, because escaped text is not necessarily a mistake, and there is no
    way to distinguish when it is.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.
    """

    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), "unicode-escape")

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)


# This regex implements an exception to restore_byte_a0, so we can decode the
# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
# mode".
#
# If byte C3 appears with a single space after it -- most commonly this shows
# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
# the space. Without this change, we would decode "à" as the start of the next
# word, such as "àla". It's almost always intended to be a separate word, as in
# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
# get coalesced into "Ã la".
#
# We make exceptions for the Portuguese words "às", "àquele", "àquela",
# "àquilo" and their plurals -- these are contractions of, for example, "a
# aquele" and are very common. Note that the final letter is important to
# distinguish this case from French "à quel point".
#
# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
# to be "África" with the accent in the other direction).
#
# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
# contain it will end up with inserted spaces. We can't do the right thing with
# every word. The cost is that the mojibake text "fÃ cil" will be interpreted as
# "fà cil", not "fàcil".
A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")


def restore_byte_a0(byts):
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)

    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b"\x20", b"\xa0")

    return ALTERED_UTF8_RE.sub(replacement, byts)

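# To illustrate the "Ã la mode" case from the comment block above: the bytes
# b"\xc3 la mode" match A_GRAVE_WORD_RE, so restore_byte_a0 should turn them
# into b"\xc3\xa0 la mode", which decodes from UTF-8 as "à la mode" with the
# word break preserved.
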

def replace_lossy_sequences(byts):
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a transcoder within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts)


def decode_inconsistent_utf8(text):
    """
    Sometimes, text from one encoding ends up embedded within text from a
    different one. This is common enough that we need to be able to fix it.

    This is used as a transcoder within `fix_encoding`.
    """

    def fix_embedded_mojibake(match):
        substr = match.group(0)

        # Require the match to be shorter, so that this doesn't recurse infinitely
        if len(substr) < len(text) and is_bad(substr):
            return ftfy.fix_encoding(substr)
        else:
            return substr

    return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)

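# The typical input here is mostly-correct text with a short UTF-8-as-Latin-1
# burst embedded in it, such as "IKEAâ€™s couch": the "â€™" run is the kind of
# sequence UTF8_DETECTOR_RE looks for, and if is_bad flags it, fix_encoding
# turns it back into "’" while the surrounding text is left untouched.
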

def _c1_fixer(match):
    return match.group(0).encode("latin-1").decode("sloppy-windows-1252")


def fix_c1_controls(text):
    """
    If text still contains C1 control characters, treat them as their
    Windows-1252 equivalents. This matches what Web browsers do.
    """
    return C1_CONTROL_RE.sub(_c1_fixer, text)
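

# A small sketch of the Windows-1252 reinterpretation described above: the C1
# codepoints U+0093 and U+0094 occupy the slots Windows-1252 uses for curly
# quotes, so fix_c1_controls("\x93Hello\x94") should come back as
# "\u201cHello\u201d" ("Hello" in curly quotes).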