1""" 

2ftfy: fixes text for you 

3 

4This is a module for making text less broken. See the `fix_text` function 

5for more information. 

6""" 

7 

8from __future__ import annotations 

9 

10import unicodedata 

11import warnings 

12from typing import ( 

13 TYPE_CHECKING, 

14 Any, 

15 BinaryIO, 

16 Callable, 

17 Literal, 

18 NamedTuple, 

19 TextIO, 

20 cast, 

21) 

22 

23from ftfy import bad_codecs, chardata, fixes 

24from ftfy.badness import is_bad 

25from ftfy.formatting import display_ljust 

26 

27if TYPE_CHECKING: 

28 from collections.abc import Iterator 

29 

30__version__ = "6.3.1" 

31 

32 

33# Though this function does nothing, it lets linters know that we're using 

34# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more. 

35bad_codecs.ok() 

36 

37 

38class ExplanationStep(NamedTuple): 

39 """ 

40 A step in an ExplainedText, explaining how to decode text. 

41 

42 The possible actions are: 

43 

44 - "encode": take in a string and encode it as bytes, with the given encoding 

45 - "decode": take in bytes and decode them as a string, with the given encoding 

46 - "transcode": convert bytes to bytes with a particular named function 

47 - "apply": convert str to str with a particular named function 

48 

49 The `parameter` is the name of the encoding or function to use. If it's a 

50 function, it must appear in the FIXERS dictionary. 

51 """ 

52 

53 action: str 

54 parameter: str 

55 

56 def __repr__(self) -> str: 

57 """ 

58 Get the string representation of an ExplanationStep. We output the 

59 representation of the equivalent tuple, for simplicity. 

60 """ 

61 return repr(tuple(self)) 
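
# A quick illustration (not part of the original module): since an
# ExplanationStep is just a named 2-tuple, it prints as the plain tuple that
# appears in explanations, e.g. ExplanationStep("encode", "latin-1") has the
# repr ('encode', 'latin-1').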


class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of which
    steps they applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    """

    text: str
    explanation: list[ExplanationStep] | None


# Functions that can be applied using `apply_plan`.
FIXERS: dict[str, Callable] = {  # type: ignore[type-arg]
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}
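
# A sketch of how this table is used (illustrative, with an assumed input):
# looking up a fixer by name and calling it on a string, e.g.
#
#     FIXERS["uncurl_quotes"]("\u2018hi\u2019")   # -> "'hi'"
#
# is what `apply_plan` does for 'apply' and 'transcode' steps.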


class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

    The following four options affect how `fix_encoding` works, and do nothing if
    `fix_encoding` is False:

    - `restore_byte_a0`: True

      Allow a literal space (U+20) to be interpreted as a non-breaking space
      (U+A0) when that would make it part of a fixable mojibake string.

      Because spaces are very common characters, this could lead to false
      positives, but we try to apply it only when there's strong evidence for
      mojibake. Disabling `restore_byte_a0` is safer from false positives,
      but creates false negatives.

    - `replace_lossy_sequences`: True

      Detect mojibake that has been partially replaced by the characters
      '�' or '?'. If the mojibake could be decoded otherwise, replace the
      detected sequence with '�'.

    - `decode_inconsistent_utf8`: True

      When we see sequences that distinctly look like UTF-8 mojibake, but
      there's no consistent way to reinterpret the string in a new encoding,
      replace the mojibake with the appropriate UTF-8 characters anyway.

      This helps to decode strings that are concatenated from different
      encodings.

    - `fix_c1_controls`: True

      Replace C1 control characters (the useless characters U+80 - U+9F that
      come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
      even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """

    unescape_html: str | bool = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Literal["NFC", "NFD", "NFKC", "NFKD"] | None = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True
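
# Usage sketch: because TextFixerConfig is a NamedTuple with defaults,
# overriding one option leaves the rest at their defaults, and `_replace`
# derives a modified copy:
#
#     config = TextFixerConfig(uncurl_quotes=False)
#     config.fix_encoding             # -> True (still the default)
#     config._replace(explain=False)  # a new config with explain disabled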


def _config_from_kwargs(config: TextFixerConfig, kwargs: dict[str, Any]) -> TextFixerConfig:
    """
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.
    """
    if "fix_entities" in kwargs:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`",
            DeprecationWarning,
            stacklevel=2,
        )
        kwargs = kwargs.copy()
        kwargs["unescape_html"] = kwargs["fix_entities"]
        del kwargs["fix_entities"]
    return config._replace(**kwargs)
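
# For example (a sketch of the deprecation path above): passing the old name
# `fix_entities=False` emits a DeprecationWarning and returns a config with
# unescape_html=False, as if the new name had been used.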


BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str,
    text: str,
    config: TextFixerConfig,
    steps: list[ExplanationStep] | None,
) -> str:
    """
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.
    """
    if getattr(config, fixer_name):
        fixer = FIXERS[fixer_name]
        fixed = fixer(text)
        if steps is not None and fixed != text:
            steps.append(ExplanationStep("apply", fixer_name))
        return cast(str, fixed)

    return text
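
# Sketch of the contract (assumed default config): something like
# _try_fix("uncurl_quotes", "\u2018x\u2019", config, steps) returns "'x'" and,
# when `steps` is a list, appends ExplanationStep("apply", "uncurl_quotes").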


def fix_text(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
        "Broken text... it's flubberific!"

        >>> fix_text('LOUD NOISES')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    """

    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)
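
# An illustration of the segmentation above (with a hypothetically small
# limit): if max_decode_length were 5, the newline-free input "abcdefgh"
# would be processed as the segments "abcde" and "fgh", fixed independently.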


def fix_and_explain(
    text: str, config: TextFixerConfig | None = None, **kwargs: Any
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: list[ExplanationStep] | None = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                text = fix_encoding(text)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                if encoding_steps is not None:
                    steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(ExplanationStep("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)


def fix_encoding_and_explain(
    text: str, config: TextFixerConfig | None = None, **kwargs: Any
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ  le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    plan_so_far: list[ExplanationStep] = []
    while True:
        prevtext = text
        text, plan = _fix_encoding_one_step_and_explain(text, config)
        if plan is not None:
            plan_so_far.extend(plan)
        if text == prevtext:
            return ExplainedText(text, plan_so_far)


def _fix_encoding_one_step_and_explain(text: str, config: TextFixerConfig) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ExplanationStep("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                #
                # Don't do this in the macroman encoding, where it would match
                # an en dash followed by a space, leading to false positives.
                if (
                    config.restore_byte_a0
                    and encoding != "macroman"
                    and chardata.ALTERED_UTF8_RE.search(encoded_bytes)
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(ExplanationStep("transcode", "restore_byte_a0"))
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "replace_lossy_sequences")
                        )
                        encoded_bytes = replaced_bytes

                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ExplanationStep("decode", decoding)
                steps = [encode_step, *transcode_steps, decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [ExplanationStep("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [
                        ExplanationStep("encode", "latin-1"),
                        ExplanationStep("decode", "windows-1252"),
                    ]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [ExplanationStep("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
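
# One-step walkthrough (illustrative): for "sÃ³", the loop above finds that the
# text fits in latin-1, encodes it to b's\xc3\xb3', sees that those bytes decode
# as UTF-8, and returns 'só' with the plan [('encode', 'latin-1'), ('decode', 'utf-8')].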


def fix_encoding(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str:
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    >>> fix_encoding("Ã³")
    'ó'
    >>> fix_encoding("&ATILDE;&SUP3;")
    '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_encoding_and_explain(text, config)
    return fixed


# Some alternate names for the main functions
ftfy = fix_text


def fix_text_segment(text: str, config: TextFixerConfig | None = None, **kwargs: Any) -> str:
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_and_explain(text, config)
    return fixed


def fix_file(
    input_file: TextIO | BinaryIO,
    encoding: str | None = None,
    config: TextFixerConfig | None = None,
    **kwargs: Any,
) -> Iterator[str]:
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
    """
    if config is None:
        config = TextFixerConfig()
    config = _config_from_kwargs(config, kwargs)

    for line in input_file:
        if isinstance(line, bytes):
            if encoding is None:
                line, encoding = guess_bytes(line)
            else:
                line = line.decode(encoding)
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        fixed_line, _explan = fix_and_explain(line, config)
        yield fixed_line
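
# Usage sketch (with a hypothetical file name): given a binary stream, the
# encoding guessed from the first line is reused for the remaining lines:
#
#     with open("messy.txt", "rb") as f:
#         for line in fix_file(f):
#             print(line, end="")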


def guess_bytes(bstring: bytes) -> tuple[str, str]:
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
    """
    if isinstance(bstring, str):
        msg = (
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )
        raise UnicodeError(msg)

    if bstring.startswith((b"\xfe\xff", b"\xff\xfe")):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)
    try:
        if 0xED in byteset or 0xC0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        else:
            return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
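
# Illustrative calls (assumed typical inputs, not from the original source):
#
#     guess_bytes(b"voil\xc3\xa0")   # -> ('voilà', 'utf-8')
#     guess_bytes(b"caf\xe9")        # -> ('café', 'sloppy-windows-1252')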


def apply_plan(text: str, plan: list[tuple[str, str]]) -> str:
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, encoding in plan:
        if operation == "encode":
            obj = obj.encode(encoding)  # type: ignore
        elif operation == "decode":
            obj = obj.decode(encoding)  # type: ignore
        elif operation in ("transcode", "apply"):
            if encoding in FIXERS:
                obj = FIXERS[encoding](obj)
            else:
                msg = f"Unknown function to apply: {encoding}"
                raise ValueError(msg)
        else:
            msg = f"Unknown plan step: {operation}"
            raise ValueError(msg)

    return obj


def explain_unicode(text: str) -> None:
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

    >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028 (       [Ps] LEFT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0 °       [So] DEGREE SIGN
    U+25A1 □       [So] WHITE SQUARE
    U+00B0 °       [So] DEGREE SIGN
    U+0029 )       [Pe] RIGHT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35 ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020         [Zs] SPACE
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if char.isprintable():
            display = char
        else:
            display = char.encode("unicode-escape").decode("ascii")
        print(
            "U+{code:04X} {display} [{category}] {name}".format(
                display=display_ljust(display, 7),
                code=ord(char),
                category=unicodedata.category(char),
                name=unicodedata.name(char, "<unknown>"),
            )
        )