Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/fixes.py: 46%

81 statements  

1""" 

2The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` 

3can perform, and provides the functions that are named in "explanations" 

4such as the output of :func:`ftfy.fix_and_explain`. 

5 

6Two of these functions are particularly useful on their own, as more robust 

7versions of functions in the Python standard library: 

8 

9- :func:`ftfy.fixes.decode_escapes` 

10- :func:`ftfy.fixes.unescape_html` 

11""" 

12 

import codecs
import html
import re
import warnings
from re import Match
from typing import Any

import ftfy
from ftfy.badness import is_bad
from ftfy.chardata import (
    ALTERED_UTF8_RE,
    C1_CONTROL_RE,
    CONTROL_CHARS,
    DOUBLE_QUOTE_RE,
    HTML_ENTITIES,
    HTML_ENTITY_RE,
    LIGATURES,
    LOSSY_UTF8_RE,
    SINGLE_QUOTE_RE,
    UTF8_DETECTOR_RE,
    WIDTH_MAP,
)

def fix_encoding_and_explain(text: str) -> Any:
    """
    Deprecated copy of `ftfy.fix_encoding_and_explain()`.
    """
    warnings.warn(
        "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.fix_encoding_and_explain(text)


def fix_encoding(text: str) -> str:
    """
    Deprecated copy of `ftfy.fix_encoding()`.
    """
    warnings.warn(
        "`fix_encoding()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.fix_encoding(text)


def apply_plan(text: str, plan: list[tuple[str, str]]) -> str:
    """
    Deprecated copy of `ftfy.apply_plan()`.
    """
    warnings.warn(
        "`apply_plan()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.apply_plan(text, plan)

def _unescape_fixup(match: Match[str]) -> str:
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text in HTML_ENTITIES:
        return HTML_ENTITIES[text]
    elif text.startswith("&#"):
        unescaped: str = html.unescape(text)

        # If html.unescape only decoded part of the string, that's not what
        # we want. The semicolon should be consumed.
        if ";" in unescaped:
            return text
        else:
            return unescaped
    else:
        return text

def unescape_html(text: str) -> str:
    """
    Decode HTML entities and character references, including some nonstandard
    ones written in all-caps.

    Python has a built-in called `html.unescape` that can decode HTML escapes,
    including a bunch of messy edge cases such as decoding escapes without
    semicolons such as "&amp".

    If you know you've got HTML-escaped text, applying `html.unescape` is the
    right way to convert it to plain text. But in ambiguous situations, that
    would create false positives. For example, the informally written text
    "this&not that" should not automatically be decoded as "this¬ that".

    In this function, we decode the escape sequences that appear in the
    `html.entities.html5` dictionary, as long as they are the unambiguous ones
    that end in semicolons.

    We also decode all-caps versions of Latin letters and common symbols.
    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
    entities, because there are many instances where entity names are
    case-sensitive in complicated ways.

    >>> unescape_html('&lt;tag&gt;')
    '<tag>'

    >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
    '𝒥ohn ℋancock'

    >>> unescape_html('&checkmark;')
    '✓'

    >>> unescape_html('P&eacute;rez')
    'Pérez'

    >>> unescape_html('P&EACUTE;REZ')
    'PÉREZ'

    >>> unescape_html('BUNDESSTRA&SZLIG;E')
    'BUNDESSTRASSE'

    >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
    'ñ Ñ Ñ &nTILDE;'
    """
    return HTML_ENTITY_RE.sub(_unescape_fixup, text)

ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")


def remove_terminal_escapes(text: str) -> str:
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub("", text)

def uncurl_quotes(text: str) -> str:
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))

def fix_latin_ligatures(text: str) -> str:
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)

def fix_character_width(text: str) -> str:
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Ｕターン"))   # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)

def fix_line_breaks(text: str) -> str:
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly not what you
      meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return (
        text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\u2028", "\n")
        .replace("\u2029", "\n")
        .replace("\u0085", "\n")
    )

SURROGATE_RE = re.compile("[\ud800-\udfff]")
SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def convert_surrogate_pair(match: Match[str]) -> str:
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
    return chr(codept)
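
# Worked example of the formula above (an illustrative note added here, not
# part of the original ftfy source): for the surrogate pair U+D83D U+DCA9,
#
#     0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDCA9 - 0xDC00) == 0x1F4A9
#
# which is U+1F4A9 (PILE OF POO), the same character shown in the doctest of
# fix_surrogates() below.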

def fix_surrogates(text: str) -> str:
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = chr(0xd83d)
    >>> low_surrogate = chr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub("\ufffd", text)
    return text

def remove_control_chars(text: str) -> str:
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    - Musical notation control characters (U+1D173 to U+1D17A) because wow if
      you're using those you probably have a good reason
    - Tag characters, because they are now used in emoji sequences such as
      "Flag of Wales"
    """
    return text.translate(CONTROL_CHARS)
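
# A small usage sketch (added for illustration, not part of the original
# source): per the lists above, a NUL byte and a stray byte order mark are
# dropped while tabs and newlines survive, so
# remove_control_chars("\ufeffhi\x00 there\t") would be expected to return
# "hi there\t".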

def remove_bom(text: str) -> str:
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(chr(0xFEFF))

# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)

def decode_escapes(text: str) -> str:
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This function has to be called specifically. It's not run automatically by
    ftfy, because escaped text is not necessarily a mistake, and there is no
    way to distinguish when it is.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.
    """

    def decode_match(match: Match[str]) -> str:
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), "unicode-escape")

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)

# This regex implements an exception to restore_byte_a0, so we can decode the
# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
# mode".
#
# If byte C3 appears with a single space after it -- most commonly this shows
# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
# the space. Without this change, we would decode "à" as the start of the next
# word, such as "àla". It's almost always intended to be a separate word, as in
# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
# get coalesced into "Ã la".
#
# We make exceptions for the Portuguese words "às", "àquele", "àquela",
# "àquilo" and their plurals -- these are contractions of, for example, "a
# aquele" and are very common. Note that the final letter is important to
# distinguish this case from French "à quel point".
#
# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
# to be "África" with the accent in the other direction).
#
# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
# contain it will end up with inserted spaces. We can't do the right thing with
# every word. The cost is that the mojibake text "fÃ cil" will be interpreted as
# "fà cil", not "fàcil".
A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")

def restore_byte_a0(byts: bytes) -> bytes:
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)

    def replacement(match: Match[bytes]) -> bytes:
        "The function to apply when this regex matches."
        return match.group(0).replace(b"\x20", b"\xa0")

    return ALTERED_UTF8_RE.sub(replacement, byts)
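
# Illustrative sketch of the above (an added note, not from the original
# source): the damaged bytes b"\xc3 la mode" -- UTF-8 for "à la mode" whose
# byte A0 was flattened to a plain space -- would be restored here to
# b"\xc3\xa0 la mode", which then decodes as "à la mode" rather than the
# run-together "àla mode".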

def replace_lossy_sequences(byts: bytes) -> bytes:
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a transcoder within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub("\ufffd".encode(), byts)
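
# Illustrative sketch (an added note, not from the original source): a curly
# closing quote ” is E2 80 9D in UTF-8. If its last byte was lost and came
# back from a sloppy codec as the SUBSTITUTE byte, the input here would
# contain b"\xe2\x80\x1a", which this substitution would be expected to
# replace with b"\xef\xbf\xbd", the UTF-8 encoding of U+FFFD.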

def decode_inconsistent_utf8(text: str) -> str:
    """
    Sometimes, text from one encoding ends up embedded within text from a
    different one. This is common enough that we need to be able to fix it.

    This is used as a transcoder within `fix_encoding`.
    """

    def fix_embedded_mojibake(match: Match[str]) -> str:
        substr = match.group(0)

        # Require the match to be shorter, so that this doesn't recurse infinitely
        if len(substr) < len(text) and is_bad(substr):
            return ftfy.fix_encoding(substr)
        else:
            return substr

    return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)
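
# A hedged example of the intent (an added note, not from the original
# source): in a string that is otherwise correct, such as "it doesnâ€™t work",
# the fragment "â€™" looks like UTF-8 mojibake, is shorter than the whole
# string, and should be flagged by is_bad(), so only that fragment would be
# re-decoded, giving "it doesn’t work".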

def _c1_fixer(match: Match[str]) -> str:
    return match.group(0).encode("latin-1").decode("sloppy-windows-1252")


def fix_c1_controls(text: str) -> str:
    """
    If text still contains C1 control characters, treat them as their
    Windows-1252 equivalents. This matches what Web browsers do.
    """
    return C1_CONTROL_RE.sub(_c1_fixer, text)
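
# For illustration (an added note, not from the original source): byte 0x85
# is HORIZONTAL ELLIPSIS in Windows-1252, so a stray C1 control character in
# fix_c1_controls("wait\x85") would be reinterpreted, giving "wait…".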