"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""

import unicodedata
import warnings
from typing import (
    Any,
    Dict,
    Iterator,
    List,
    NamedTuple,
    Optional,
    TextIO,
    Tuple,
    Union,
    cast,
    no_type_check,
)

from ftfy import bad_codecs
from ftfy import chardata, fixes
from ftfy.badness import is_bad
from ftfy.formatting import display_ljust

__version__ = "6.1.2"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
bad_codecs.ok()


class ExplanationStep(NamedTuple):
    """
    A step in an ExplainedText, explaining how to decode text.

    The possible actions are:

    - "encode": take in a string and encode it as bytes, with the given encoding
    - "decode": take in bytes and decode them as a string, with the given encoding
    - "transcode": convert bytes to bytes with a particular named function
    - "apply": convert str to str with a particular named function

    The `parameter` is the name of the encoding or function to use. If it's a
    function, it must appear in the FIXERS dictionary.
    """

    action: str
    parameter: str

    def __repr__(self) -> str:
        """
        Get the string representation of an ExplanationStep. We output the
        representation of the equivalent tuple, for simplicity.
        """
        return repr(tuple(self))
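
# A quick sketch (illustrative, not part of the original module): because an
# ExplanationStep reprs like the tuple it wraps, a plan prints compactly:
#
#     step = ExplanationStep("decode", "utf-8")
#     repr(step)   # -> "('decode', 'utf-8')"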



class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of which
    steps they applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    """

    text: str
    explanation: Optional[List[ExplanationStep]]


# Functions that can be applied using `apply_plan`.
FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}
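
# For illustration (a sketch, not part of the module): each FIXERS entry maps a
# configuration field name to the function that implements it, so a fix can be
# looked up and applied by name:
#
#     FIXERS["uncurl_quotes"]("\u2018hi\u2019")   # -> "'hi'"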



class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

      The following four options affect how `fix_encoding` works, and do nothing if
      `fix_encoding` is False:

    - `restore_byte_a0`: True

      Allow a literal space (U+20) to be interpreted as a non-breaking space
      (U+A0) when that would make it part of a fixable mojibake string.

      Because spaces are very common characters, this could lead to false
      positives, but we try to apply it only when there's strong evidence for
      mojibake. Disabling `restore_byte_a0` is safer from false positives,
      but creates false negatives.

    - `replace_lossy_sequences`: True

      Detect mojibake that has been partially replaced by the characters
      '�' or '?'. If the mojibake could be decoded otherwise, replace the
      detected sequence with '�'.

    - `decode_inconsistent_utf8`: True

      When we see sequences that distinctly look like UTF-8 mojibake, but
      there's no consistent way to reinterpret the string in a new encoding,
      replace the mojibake with the appropriate UTF-8 characters anyway.

      This helps to decode strings that are concatenated from different
      encodings.

    - `fix_c1_controls`: True

      Replace C1 control characters (the useless characters U+80 - U+9B that
      come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
      even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True
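
# Usage sketch (illustrative): TextFixerConfig is a NamedTuple, so defaults can
# be overridden at construction time, and `_replace` derives a modified copy:
#
#     config = TextFixerConfig(uncurl_quotes=False, explain=False)
#     config.normalization                  # -> "NFC" (still the default)
#     config = config._replace(normalization=None)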



def _config_from_kwargs(
    config: TextFixerConfig, kwargs: Dict[str, Any]
) -> TextFixerConfig:
    """
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.
    """
    if "fix_entities" in kwargs:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`", DeprecationWarning
        )
        kwargs = kwargs.copy()
        kwargs["unescape_html"] = kwargs["fix_entities"]
        del kwargs["fix_entities"]
    config = config._replace(**kwargs)
    return config
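
# For example (a sketch): _config_from_kwargs(TextFixerConfig(), {"uncurl_quotes": False})
# returns a config with uncurl_quotes=False, while the deprecated spelling
# {"fix_entities": False} triggers a DeprecationWarning and is applied as
# {"unescape_html": False}.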



BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str,
    text: str,
    config: TextFixerConfig,
    steps: Optional[List[ExplanationStep]],
) -> str:
    """
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.
    """
    if getattr(config, fixer_name):
        fixer = FIXERS[fixer_name]
        fixed = fixer(text)
        if steps is not None and fixed != text:
            steps.append(ExplanationStep("apply", fixer_name))
        return cast(str, fixed)

    return text
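
# For example (a sketch): with `uncurl_quotes` enabled and `steps = []`,
#
#     _try_fix("uncurl_quotes", "\u2018hi\u2019", config, steps)
#
# returns "'hi'" and appends ExplanationStep("apply", "uncurl_quotes") to
# `steps`; with the option disabled, the text comes back unchanged and nothing
# is recorded.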



def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

    >>> fix_text('âœ” No problems')
    '✔ No problems'

    >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
    ¯\_(ツ)_/¯

    >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
    "Broken text... it's flubberific!"

    >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
    'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    """

    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)
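
# A behavioral sketch (illustrative): the "auto" setting for `unescape_html`
# unescapes entities unless a literal '<' suggests the input is real HTML:
#
#     fix_text("&lt;3")           # -> "<3" (no literal '<' in the input)
#     fix_text("<b>&lt;3</b>")    # -> "<b>&lt;3</b>" (markup, entities kept)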



def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[ExplanationStep]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                text = fix_encoding(text)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                if encoding_steps is not None:
                    steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(ExplanationStep("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)
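
# A quick sketch (illustrative): the returned plan can be replayed on other
# text with the same problem using `apply_plan`:
#
#     fixed, plan = fix_and_explain("schÃ¶n")
#     fixed   # -> 'schön'
#     plan    # -> [('encode', 'latin-1'), ('decode', 'utf-8')]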



def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ  le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    plan_so_far: List[ExplanationStep] = []
    while True:
        prevtext = text
        text, plan = _fix_encoding_one_step_and_explain(text, config)
        if plan is not None:
            plan_so_far.extend(plan)
        if text == prevtext:
            return ExplainedText(text, plan_so_far)
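
# Note (an illustrative sketch, not from the original module): because this
# loop repeats until the text stops changing, doubly-encoded mojibake can be
# unwound over multiple passes. For example, "sÃƒÂ³" should come back as "só",
# with two encode/decode rounds appearing in the returned plan.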



def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ExplanationStep("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "restore_byte_a0")
                        )
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "replace_lossy_sequences")
                        )
                        encoded_bytes = replaced_bytes

                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ExplanationStep("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [ExplanationStep("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [
                        ExplanationStep("encode", "latin-1"),
                        ExplanationStep("decode", "windows-1252"),
                    ]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [ExplanationStep("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
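
# For example (a sketch): the C1 controls U+0093 and U+0094 are valid Latin-1
# but have no place in Windows-1252, so the branch above should re-read them as
# the curly quotes they were meant to be:
#
#     fix_encoding("\x93basta\x94")   # expected: '“basta”'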



def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    >>> fix_encoding("Ã³")
    'ó'
    >>> fix_encoding("&ATILDE;&SUP3;")
    '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_encoding_and_explain(text, config)
    return fixed



# Some alternate names for the main functions
ftfy = fix_text


def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_and_explain(text, config)
    return fixed



def fix_file(
    input_file: TextIO,
    encoding: Optional[str] = None,
    config: Optional[TextFixerConfig] = None,
    **kwargs
) -> Iterator[str]:
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
    """
    if config is None:
        config = TextFixerConfig()
    config = _config_from_kwargs(config, kwargs)

    for line in input_file:
        if isinstance(line, bytes):
            if encoding is None:
                line, encoding = guess_bytes(line)
            else:
                line = line.decode(encoding)
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        fixed_line, _explan = fix_and_explain(line, config)
        yield fixed_line
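
# Usage sketch (illustrative, not part of the module):
#
#     import io
#     for line in fix_file(io.StringIO("schÃ¶n\n")):
#         print(line, end="")   # -> schön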



def guess_bytes(bstring: bytes) -> Tuple[str, str]:
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)
    try:
        if 0xED in byteset or 0xC0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        else:
            return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
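
# For example (a sketch of the decision tree above):
#
#     guess_bytes("résumé".encode("utf-8"))   # -> ('résumé', 'utf-8')
#     guess_bytes(b"caf\xe9")                 # -> ('café', 'sloppy-windows-1252')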



@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, encoding in plan:
        if operation == "encode":
            obj = obj.encode(encoding)
        elif operation == "decode":
            obj = obj.decode(encoding)
        elif operation in ("transcode", "apply"):
            if encoding in FIXERS:
                obj = FIXERS[encoding](obj)
            else:
                raise ValueError("Unknown function to apply: %s" % encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj

757 

758 

759def explain_unicode(text: str): 

760 """ 

761 A utility method that's useful for debugging mysterious Unicode. 

762 

763 It breaks down a string, showing you for each codepoint its number in 

764 hexadecimal, its glyph, its category in the Unicode standard, and its name 

765 in the Unicode standard. 

766 

767 >>> explain_unicode('(╯°□°)╯︵ ┻━┻') 

768 U+0028 ( [Ps] LEFT PARENTHESIS 

769 U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT 

770 U+00B0 ° [So] DEGREE SIGN 

771 U+25A1 □ [So] WHITE SQUARE 

772 U+00B0 ° [So] DEGREE SIGN 

773 U+0029 ) [Pe] RIGHT PARENTHESIS 

774 U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT 

775 U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS 

776 U+0020 [Zs] SPACE 

777 U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL 

778 U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL 

779 U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL 

780 """ 

781 for char in text: 

782 if char.isprintable(): 

783 display = char 

784 else: 

785 display = char.encode("unicode-escape").decode("ascii") 

786 print( 

787 "U+{code:04X} {display} [{category}] {name}".format( 

788 display=display_ljust(display, 7), 

789 code=ord(char), 

790 category=unicodedata.category(char), 

791 name=unicodedata.name(char, "<unknown>"), 

792 ) 

793 )