Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/fixes.py: 46%

81 statements  

1""" 

2The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` 

3can perform, and provides the functions that are named in "explanations" 

4such as the output of :func:`ftfy.fix_and_explain`. 

5 

6Two of these functions are particularly useful on their own, as more robust 

7versions of functions in the Python standard library: 

8 

9- :func:`ftfy.fixes.decode_escapes` 

10- :func:`ftfy.fixes.unescape_html` 

11""" 

12 

import codecs
import html
import re
import warnings
from re import Match
from typing import Any

import ftfy
from ftfy.badness import is_bad
from ftfy.chardata import (
    ALTERED_UTF8_RE,
    C1_CONTROL_RE,
    CONTROL_CHARS,
    DOUBLE_QUOTE_RE,
    HTML_ENTITIES,
    HTML_ENTITY_RE,
    LIGATURES,
    LOSSY_UTF8_RE,
    SINGLE_QUOTE_RE,
    UTF8_DETECTOR_RE,
    WIDTH_MAP,
)

def fix_encoding_and_explain(text: str) -> Any:
    """
    Deprecated copy of `ftfy.fix_encoding_and_explain()`.
    """
    warnings.warn(
        "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.fix_encoding_and_explain(text)


def fix_encoding(text: str) -> str:
    """
    Deprecated copy of `ftfy.fix_encoding()`.
    """
    warnings.warn(
        "`fix_encoding()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.fix_encoding(text)


def apply_plan(text: str, plan: list[tuple[str, str]]) -> str:
    """
    Deprecated copy of `ftfy.apply_plan()`.
    """
    warnings.warn(
        "`apply_plan()` has moved to the main module of ftfy.",
        DeprecationWarning,
        stacklevel=2,
    )
    return ftfy.apply_plan(text, plan)

def _unescape_fixup(match: Match[str]) -> str:
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text in HTML_ENTITIES:
        return HTML_ENTITIES[text]
    elif text.startswith("&#"):
        unescaped: str = html.unescape(text)

        # If html.unescape only decoded part of the string, that's not what
        # we want. The semicolon should be consumed.
        if ";" in unescaped:
            return text
        else:
            return unescaped
    else:
        return text

def unescape_html(text: str) -> str:
    """
    Decode HTML entities and character references, including some nonstandard
    ones written in all-caps.

    Python has a built-in called `html.unescape` that can decode HTML escapes,
    including a bunch of messy edge cases such as decoding escapes without
    semicolons such as "&amp".

    If you know you've got HTML-escaped text, applying `html.unescape` is the
    right way to convert it to plain text. But in ambiguous situations, that
    would create false positives. For example, the informally written text
    "this&not that" should not automatically be decoded as "this¬ that".

    In this function, we decode the escape sequences that appear in the
    `html.entities.html5` dictionary, as long as they are the unambiguous ones
    that end in semicolons.

    We also decode all-caps versions of Latin letters and common symbols.
    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
    entities, because there are many instances where entity names are
    case-sensitive in complicated ways.

    >>> unescape_html('&lt;tag&gt;')
    '<tag>'

    >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
    '𝒥ohn ℋancock'

    >>> unescape_html('&checkmark;')
    '✓'

    >>> unescape_html('P&eacute;rez')
    'Pérez'

    >>> unescape_html('P&EACUTE;REZ')
    'PÉREZ'

    >>> unescape_html('BUNDESSTRA&SZLIG;E')
    'BUNDESSTRASSE'

    >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
    'ñ Ñ Ñ &nTILDE;'
    """
    return HTML_ENTITY_RE.sub(_unescape_fixup, text)

ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")


def remove_terminal_escapes(text: str) -> str:
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub("", text)

def uncurl_quotes(text: str) -> str:
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))

def fix_latin_ligatures(text: str) -> str:
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)

def fix_character_width(text: str) -> str:
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Ｕターン"))   # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)

def fix_line_breaks(text: str) -> str:
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly not what you
      meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return (
        text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\u2028", "\n")
        .replace("\u2029", "\n")
        .replace("\u0085", "\n")
    )

SURROGATE_RE = re.compile("[\ud800-\udfff]")
SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def convert_surrogate_pair(match: Match[str]) -> str:
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
    return chr(codept)
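
# Worked example of the formula above (an illustrative note added here, not
# part of the original ftfy source): for the surrogate pair U+D83D U+DCA9,
#
#     0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDCA9 - 0xDC00) == 0x1F4A9
#
# which is U+1F4A9 (PILE OF POO), the same character shown in the doctest of
# fix_surrogates() below.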

def fix_surrogates(text: str) -> str:
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = chr(0xd83d)
    >>> low_surrogate = chr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub("\ufffd", text)
    return text

def remove_control_chars(text: str) -> str:
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    - Musical notation control characters (U+1D173 to U+1D17A) because wow if
      you're using those you probably have a good reason
    - Tag characters, because they are now used in emoji sequences such as
      "Flag of Wales"
    """
    return text.translate(CONTROL_CHARS)
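
# A small usage sketch (added for illustration, not part of the original
# source): per the lists above, a NUL byte and a stray byte order mark are
# dropped while tabs and newlines survive, so
# remove_control_chars("\ufeffhi\x00 there\t") would be expected to return
# "hi there\t".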

def remove_bom(text: str) -> str:
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(chr(0xFEFF))

# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)

def decode_escapes(text: str) -> str:
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This function has to be called specifically. It's not run automatically by
    ftfy, because escaped text is not necessarily a mistake, and there is no
    way to distinguish when it is.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.
    """

    def decode_match(match: Match[str]) -> str:
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), "unicode-escape")

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)

# This regex implements an exception to restore_byte_a0, so we can decode the
# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
# mode".
#
# If byte C3 appears with a single space after it -- most commonly this shows
# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
# the space. Without this change, we would decode "à" as the start of the next
# word, such as "àla". It's almost always intended to be a separate word, as in
# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
# get coalesced into "Ã la".
#
# We make exceptions for the Portuguese words "às", "àquele", "àquela",
# "àquilo" and their plurals -- these are contractions of, for example, "a
# aquele" and are very common. Note that the final letter is important to
# distinguish this case from French "à quel point".
#
# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
# to be "África" with the accent in the other direction).
#
# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
# contain it will end up with inserted spaces. We can't do the right thing with
# every word. The cost is that the mojibake text "fÃ cil" will be interpreted as
# "fà cil", not "fàcil".
A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")

def restore_byte_a0(byts: bytes) -> bytes:
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)

    def replacement(match: Match[bytes]) -> bytes:
        "The function to apply when this regex matches."
        return match.group(0).replace(b"\x20", b"\xa0")

    return ALTERED_UTF8_RE.sub(replacement, byts)
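
# Illustrative sketch of the above (an added note, not from the original
# source): the damaged bytes b"\xc3 la mode" -- UTF-8 for "à la mode" whose
# byte A0 was flattened to a plain space -- would be restored here to
# b"\xc3\xa0 la mode", which then decodes as "à la mode" rather than the
# run-together "àla mode".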

def replace_lossy_sequences(byts: bytes) -> bytes:
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a transcoder within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub("\ufffd".encode(), byts)
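
# Illustrative sketch (an added note, not from the original source): a curly
# closing quote ” is E2 80 9D in UTF-8. If its last byte was lost and came
# back from a sloppy codec as the SUBSTITUTE byte, the input here would
# contain b"\xe2\x80\x1a", which this substitution would be expected to
# replace with b"\xef\xbf\xbd", the UTF-8 encoding of U+FFFD.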

def decode_inconsistent_utf8(text: str) -> str:
    """
    Sometimes, text from one encoding ends up embedded within text from a
    different one. This is common enough that we need to be able to fix it.

    This is used as a transcoder within `fix_encoding`.
    """

    def fix_embedded_mojibake(match: Match[str]) -> str:
        substr = match.group(0)

        # Require the match to be shorter, so that this doesn't recurse infinitely
        if len(substr) < len(text) and is_bad(substr):
            return ftfy.fix_encoding(substr)
        else:
            return substr

    return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)
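
# A hedged example of the intent (an added note, not from the original
# source): in a string that is otherwise correct, such as "it doesnâ€™t work",
# the fragment "â€™" looks like UTF-8 mojibake, is shorter than the whole
# string, and should be flagged by is_bad(), so only that fragment would be
# re-decoded, giving "it doesn’t work".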

def _c1_fixer(match: Match[str]) -> str:
    return match.group(0).encode("latin-1").decode("sloppy-windows-1252")


def fix_c1_controls(text: str) -> str:
    """
    If text still contains C1 control characters, treat them as their
    Windows-1252 equivalents. This matches what Web browsers do.
    """
    return C1_CONTROL_RE.sub(_c1_fixer, text)
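
# For illustration (an added note, not from the original source): byte 0x85
# is HORIZONTAL ELLIPSIS in Windows-1252, so a stray C1 control character in
# fix_c1_controls("wait\x85") would be reinterpreted, giving "wait…".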