Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/dammit.py: 39%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

367 statements  

1# -*- coding: utf-8 -*- 

2"""Beautiful Soup bonus library: Unicode, Dammit 

3 

4This library converts a bytestream to Unicode through any means 

5necessary. It is heavily based on code from Mark Pilgrim's `Universal 

6Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained 

7by Kurt McKee. It does not rewrite the body of an XML or HTML document 

8to reflect a new encoding; that's the job of `TreeBuilder`. 

9 

10""" 

11 

12# Use of this source code is governed by the MIT license. 

13__license__ = "MIT" 

14 

15from html.entities import codepoint2name 

16from collections import defaultdict 

17import codecs 

18from html.entities import html5 

19import re 

20from logging import Logger, getLogger 

21from types import ModuleType 

22from typing import ( 

23 Dict, 

24 Iterator, 

25 List, 

26 Optional, 

27 Pattern, 

28 Set, 

29 Tuple, 

30 Type, 

31 Union, 

32 cast, 

33) 

34from typing_extensions import Literal 

35from bs4._typing import ( 

36 _Encoding, 

37 _Encodings, 

38) 

39import warnings 

40 

41# Import a library to autodetect character encodings. We'll support 

42# any of a number of libraries that all support the same API: 

43# 

44# * cchardet 

45# * chardet 

46# * charset-normalizer 

47chardet_module: Optional[ModuleType] = None 

48try: 

49 # PyPI package: cchardet 

50 import cchardet 

51 

52 chardet_module = cchardet 

53except ImportError: 

54 try: 

55 # Debian package: python-chardet 

56 # PyPI package: chardet 

57 import chardet 

58 

59 chardet_module = chardet 

60 except ImportError: 

61 try: 

62 # PyPI package: charset-normalizer 

63 import charset_normalizer 

64 

65 chardet_module = charset_normalizer 

66 except ImportError: 

67 # No chardet available. 

68 pass 

69 

70 

71def _chardet_dammit(s: bytes) -> Optional[str]: 

72 """Try as hard as possible to detect the encoding of a bytestring.""" 

73 if chardet_module is None or isinstance(s, str): 

74 return None 

75 module = chardet_module 

76 return module.detect(s)["encoding"] 

77 

78 

79# Build bytestring and Unicode versions of regular expressions for finding 

80# a declared encoding inside an XML or HTML document. 

# Regular expressions (source strings) for locating an encoding
# declared inside an XML prolog or an HTML <meta> tag.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
#
# Compile both expressions twice -- once against bytes, once against
# str -- so callers can search markup of either type without
# re-encoding it first.
encoding_res: Dict[Type, Dict[str, Pattern]] = {
    bytes: {
        "html": re.compile(html_meta.encode("ascii"), re.I),
        "xml": re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        "html": re.compile(html_meta, re.I),
        "xml": re.compile(xml_encoding, re.I),
    },
}

96 

97 

class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: A map of named HTML entities to the corresponding Unicode string.
    #:
    #: :meta hide-value:
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: A map of Unicode strings to the corresponding named HTML entities;
    #: the inverse of HTML_ENTITY_TO_CHARACTER.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: A regular expression that matches any character (or, in rare
    #: cases, pair of characters) that can be replaced with a named
    #: HTML entity.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: A very similar regular expression to
    #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
    #: ampersands. This is used by the 'html' formatted to provide
    #: backwards-compatibility, even though the HTML5 spec allows most
    #: ampersands to go unescaped.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @classmethod
    def _populate_class_variables(cls) -> None:
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function sets the following class variables:

        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.

        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
        regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
        also matches unescaped ampersands. This is used by the 'html'
        formatted to provide backwards-compatibility, even though the HTML5
        spec allows most ampersands to go unescaped.
        """
        unicode_to_name: Dict[str, str] = {}
        name_to_unicode: Dict[str, str] = {}

        short_entities: Set[str] = set()
        long_entities_by_first_character: Dict[str, Set[str]] = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                # html5.items() was sorted, so the first name seen for
                # a given spelling wins here.
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, \u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles: Set[str] = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        # The ampersand-matching variant is identical except that a
        # bare "&" is also a valid particle.
        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            unicode_to_name[character] = name

        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string."""
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            # No named entity for this string; escape the ampersand
            # but leave the rest alone.
            return "&amp;%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        """Used with ANY_ENTITY_RE to escape the ampersand of every
        entity-shaped string, e.g. "&amp;" -> "&amp;amp;"."""
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        """Used with ANY_ENTITY_RE to escape the ampersand of an
        entity-shaped string, but only if it's not a recognized HTML5
        named entity."""
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            return "&%s;" % possible_entity
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

        Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

        Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

        Welcome to "Bob's Bar" -> Welcome to &quot;Bob's bar&quot;

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
         with named entities.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity defition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # Convert any appropriate characters to HTML entities.
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # First, escape any HTML entities found in the markup.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Next, convert any appropriate characters to unescaped HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # First, escape the ampersand for anything that looks like an
        # entity but isn't in the list of recognized entities. All other
        # ampersands can be left alone.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then, convert a range of Unicode characters to unescaped
        # HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s


# Build the entity tables and regular expressions once, at import time.
EntitySubstitution._populate_class_variables()

483 

484 

class EncodingDetector:
    """This class is capable of guessing a number of possible encodings
    for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
       (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
       byte-order mark sniffing fails (the ``user_encodings`` argument to the
       constructor).

    4. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.

    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            # The deprecated alias is still honored, but callers are
            # warned to migrate.
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        # Exclusions are compared case-insensitively, so normalize
        # them to lowercase up front.
        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = False if is_html is None else is_html
        self.declared_encoding: Optional[str] = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This
            will be modified as a side effect.
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings:
            return False
        if encoding not in tried:
            # Record it so the same encoding is never yielded twice.
            tried.add(encoding)
            return True
        return False

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings. Each is the name of an encoding
           that *might* work to convert a bytestring into Unicode.
        """
        tried: Set[_Encoding] = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self.sniffed_encoding is not None and self._usable(
            self.sniffed_encoding, tried
        ):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration. (Computed lazily and cached, so repeated
        # iteration doesn't re-run the search.)
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        if self.declared_encoding is not None and self._usable(
            self.declared_encoding, tried
        ):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. (Also computed lazily and cached -- chardet can
        # be expensive.)
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        if self.chardet_encoding is not None and self._usable(
            self.chardet_encoding, tried
        ):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ("utf-8", "windows-1252"):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
           byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks require the following two bytes not to be
        # \x00\x00, which would instead indicate a UTF-32 BOM.
        if (
            (len(data) >= 4)
            and (data[:2] == b"\xfe\xff")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16be"
            data = data[2:]
        elif (
            (len(data) >= 4)
            and (data[:2] == b"\xff\xfe")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16le"
            data = data[2:]
        elif data[:3] == b"\xef\xbb\xbf":
            encoding = "utf-8"
            data = data[3:]
        elif data[:4] == b"\x00\x00\xfe\xff":
            encoding = "utf-32be"
            data = data[4:]
        elif data[:4] == b"\xff\xfe\x00\x00":
            encoding = "utf-32le"
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Given a document, tries to find an encoding declared within the
        text of the document itself.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data. Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, if one is found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Heuristic search windows: XML declarations come first,
            # HTML <meta> tags may appear a little later.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the regexes compiled for the markup's type (bytes or str).
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res["xml"]
        html_re = res["html"]
        declared_encoding: Optional[_Encoding] = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode("ascii", "replace")
            return declared_encoding.lower()
        return None

732 

733 

734class UnicodeDammit: 

735 """A class for detecting the encoding of a bytestring containing an 

736 HTML or XML document, and decoding it to Unicode. If the source 

737 encoding is windows-1252, `UnicodeDammit` can also replace 

738 Microsoft smart quotes with their HTML or XML equivalents. 

739 

740 :param markup: HTML or XML markup in an unknown encoding. 

741 

742 :param known_definite_encodings: When determining the encoding 

743 of ``markup``, these encodings will be tried first, in 

744 order. In HTML terms, this corresponds to the "known 

745 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_. 

746 

747 :param user_encodings: These encodings will be tried after the 

748 ``known_definite_encodings`` have been tried and failed, and 

749 after an attempt to sniff the encoding by looking at a 

750 byte order mark has failed. In HTML terms, this 

751 corresponds to the step "user has explicitly instructed 

752 the user agent to override the document's character 

753 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_. 

754 

755 :param override_encodings: A **deprecated** alias for 

756 ``known_definite_encodings``. Any encodings here will be tried 

757 immediately after the encodings in 

758 ``known_definite_encodings``. 

759 

760 :param smart_quotes_to: By default, Microsoft smart quotes will, 

761 like all other characters, be converted to Unicode 

762 characters. Setting this to ``ascii`` will convert them to ASCII 

763 quotes instead. Setting it to ``xml`` will convert them to XML 

764 entity references, and setting it to ``html`` will convert them 

765 to HTML entity references. 

766 

767 :param is_html: If True, ``markup`` is treated as an HTML 

768 document. Otherwise it's treated as an XML document. 

769 

770 :param exclude_encodings: These encodings will not be considered, 

771 even if the sniffing code thinks they might make sense. 

772 

773 """ 

774 

775 def __init__( 

776 self, 

777 markup: bytes, 

778 known_definite_encodings: Optional[_Encodings] = [], 

779 smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None, 

780 is_html: bool = False, 

781 exclude_encodings: Optional[_Encodings] = [], 

782 user_encodings: Optional[_Encodings] = None, 

783 override_encodings: Optional[_Encodings] = None, 

784 ): 

785 self.smart_quotes_to = smart_quotes_to 

786 self.tried_encodings = [] 

787 self.contains_replacement_characters = False 

788 self.is_html = is_html 

789 self.log = getLogger(__name__) 

790 self.detector = EncodingDetector( 

791 markup, 

792 known_definite_encodings, 

793 is_html, 

794 exclude_encodings, 

795 user_encodings, 

796 override_encodings, 

797 ) 

798 

799 # Short-circuit if the data is in Unicode to begin with. 

800 if isinstance(markup, str) or markup == b"": 

801 self.markup = markup 

802 self.unicode_markup = str(markup) 

803 self.original_encoding = None 

804 return 

805 

806 # The encoding detector may have stripped a byte-order mark. 

807 # Use the stripped markup from this point on. 

808 self.markup = self.detector.markup 

809 

810 u = None 

811 for encoding in self.detector.encodings: 

812 markup = self.detector.markup 

813 u = self._convert_from(encoding) 

814 if u is not None: 

815 break 

816 

817 if not u: 

818 # None of the encodings worked. As an absolute last resort, 

819 # try them again with character replacement. 

820 

821 for encoding in self.detector.encodings: 

822 if encoding != "ascii": 

823 u = self._convert_from(encoding, "replace") 

824 if u is not None: 

825 self.log.warning( 

826 "Some characters could not be decoded, and were " 

827 "replaced with REPLACEMENT CHARACTER." 

828 ) 

829 

830 self.contains_replacement_characters = True 

831 break 

832 

833 # If none of that worked, we could at this point force it to 

834 # ASCII, but that would destroy so much data that I think 

835 # giving up is better. 

836 # 

837 # Note that this is extremely unlikely, probably impossible, 

838 # because the "replace" strategy is so powerful. Even running 

839 # the Python binary through Unicode, Dammit gives you Unicode, 

840 # albeit Unicode riddled with REPLACEMENT CHARACTER. 

841 if u is None: 

842 self.original_encoding = None 

843 self.unicode_markup = None 

844 else: 

845 self.unicode_markup = u 

846 

847 #: The original markup, before it was converted to Unicode. 

848 #: This is not necessarily the same as what was passed in to the 

849 #: constructor, since any byte-order mark will be stripped. 

850 markup: bytes 

851 

852 #: The Unicode version of the markup, following conversion. This 

853 #: is set to None if there was simply no way to convert the 

854 #: bytestring to Unicode (as with binary data). 

855 unicode_markup: Optional[str] 

856 

857 #: This is True if `UnicodeDammit.unicode_markup` contains 

858 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present 

859 #: in `UnicodeDammit.markup`. These mark character sequences that 

860 #: could not be represented in Unicode. 

861 contains_replacement_characters: bool 

862 

863 #: Unicode, Dammit's best guess as to the original character 

864 #: encoding of `UnicodeDammit.markup`. 

865 original_encoding: Optional[_Encoding] 

866 

867 #: The strategy used to handle Microsoft smart quotes. 

868 smart_quotes_to: Optional[str] 

869 

870 #: The (encoding, error handling strategy) 2-tuples that were used to 

871 #: try and convert the markup to Unicode. 

872 tried_encodings: List[Tuple[_Encoding, str]] 

873 

874 log: Logger #: :meta private: 

875 

876 def _sub_ms_char(self, match: re.Match) -> bytes: 

877 """Changes a MS smart quote character to an XML or HTML 

878 entity, or an ASCII character. 

879 

880 TODO: Since this is only used to convert smart quotes, it 

881 could be simplified, and MS_CHARS_TO_ASCII made much less 

882 parochial. 

883 """ 

884 orig: bytes = match.group(1) 

885 sub: bytes 

886 if self.smart_quotes_to == "ascii": 

887 if orig in self.MS_CHARS_TO_ASCII: 

888 sub = self.MS_CHARS_TO_ASCII[orig].encode() 

889 else: 

890 # Shouldn't happen; substitute the character 

891 # with itself. 

892 sub = orig 

893 else: 

894 if orig in self.MS_CHARS: 

895 substitutions = self.MS_CHARS[orig] 

896 if type(substitutions) is tuple: 

897 if self.smart_quotes_to == "xml": 

898 sub = b"&#x" + substitutions[1].encode() + b";" 

899 else: 

900 sub = b"&" + substitutions[0].encode() + b";" 

901 else: 

902 substitutions = cast(str, substitutions) 

903 sub = substitutions.encode() 

904 else: 

905 # Shouldn't happen; substitute the character 

906 # for itself. 

907 sub = orig 

908 return sub 

909 

910 #: This dictionary maps commonly seen values for "charset" in HTML 

911 #: meta tags to the corresponding Python codec names. It only covers 

912 #: values that aren't in Python's aliases and can't be determined 

913 #: by the heuristics in `find_codec`. 

914 #: 

915 #: :meta hide-value: 

916 CHARSET_ALIASES: Dict[str, _Encoding] = { 

917 "macintosh": "mac-roman", 

918 "x-sjis": "shift-jis", 

919 } 

920 

921 #: A list of encodings that tend to contain Microsoft smart quotes. 

922 #: 

923 #: :meta hide-value: 

924 ENCODINGS_WITH_SMART_QUOTES: _Encodings = [ 

925 "windows-1252", 

926 "iso-8859-1", 

927 "iso-8859-2", 

928 ] 

929 

930 def _convert_from( 

931 self, proposed: _Encoding, errors: str = "strict" 

932 ) -> Optional[str]: 

933 """Attempt to convert the markup to the proposed encoding. 

934 

935 :param proposed: The name of a character encoding. 

936 :param errors: An error handling strategy, used when calling `str`. 

937 :return: The converted markup, or `None` if the proposed 

938 encoding/error handling strategy didn't work. 

939 """ 

940 lookup_result = self.find_codec(proposed) 

941 if lookup_result is None or (lookup_result, errors) in self.tried_encodings: 

942 return None 

943 proposed = lookup_result 

944 self.tried_encodings.append((proposed, errors)) 

945 markup = self.markup 

946 # Convert smart quotes to HTML if coming from an encoding 

947 # that might have them. 

948 if ( 

949 self.smart_quotes_to is not None 

950 and proposed in self.ENCODINGS_WITH_SMART_QUOTES 

951 ): 

952 smart_quotes_re = b"([\x80-\x9f])" 

953 smart_quotes_compiled = re.compile(smart_quotes_re) 

954 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 

955 

956 try: 

957 # print("Trying to convert document to %s (errors=%s)" % ( 

958 # proposed, errors)) 

959 u = self._to_unicode(markup, proposed, errors) 

960 self.unicode_markup = u 

961 self.original_encoding = proposed 

962 except Exception: 

963 # print("That didn't work!") 

964 # print(e) 

965 return None 

966 # print("Correct encoding: %s" % proposed) 

967 return self.unicode_markup 

968 

969 def _to_unicode( 

970 self, data: bytes, encoding: _Encoding, errors: str = "strict" 

971 ) -> str: 

972 """Given a bytestring and its encoding, decodes the string into Unicode. 

973 

974 :param encoding: The name of an encoding. 

975 :param errors: An error handling strategy, used when calling `str`. 

976 """ 

977 return str(data, encoding, errors) 

978 

@property
def declared_html_encoding(self) -> Optional[_Encoding]:
    """If the markup is an HTML document, returns the encoding, if any,
    declared *inside* the document.
    """
    if self.is_html:
        # Delegate to the EncodingDetector built in the constructor.
        return self.detector.declared_encoding
    return None

987 

def find_codec(self, charset: _Encoding) -> Optional[str]:
    """Look up the Python codec corresponding to a given character set.

    :param charset: The name of a character set.
    :return: The lowercased name of a Python codec; if no real codec
        is found, falls back to the lowercased charset name itself.
        Returns `None` for an empty/missing charset.
    """
    if not charset:
        return None
    # Try the name itself (resolving known aliases first), then
    # variants with hyphens stripped or turned into underscores.
    for candidate in (
        self.CHARSET_ALIASES.get(charset, charset),
        charset.replace("-", ""),
        charset.replace("-", "_"),
    ):
        codec = self._codec(candidate)
        if codec:
            return codec.lower()
    # No codec resolved; preserve historical behavior of returning
    # the lowercased charset name anyway.
    return charset.lower()

1004 

1005 def _codec(self, charset: _Encoding) -> Optional[str]: 

1006 if not charset: 

1007 return charset 

1008 codec = None 

1009 try: 

1010 codecs.lookup(charset) 

1011 codec = charset 

1012 except (LookupError, ValueError): 

1013 pass 

1014 return codec 

1015 

1016 #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 

1017 #: 

1018 #: :meta hide-value: 

1019 MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = { 

1020 b"\x80": ("euro", "20AC"), 

1021 b"\x81": " ", 

1022 b"\x82": ("sbquo", "201A"), 

1023 b"\x83": ("fnof", "192"), 

1024 b"\x84": ("bdquo", "201E"), 

1025 b"\x85": ("hellip", "2026"), 

1026 b"\x86": ("dagger", "2020"), 

1027 b"\x87": ("Dagger", "2021"), 

1028 b"\x88": ("circ", "2C6"), 

1029 b"\x89": ("permil", "2030"), 

1030 b"\x8a": ("Scaron", "160"), 

1031 b"\x8b": ("lsaquo", "2039"), 

1032 b"\x8c": ("OElig", "152"), 

1033 b"\x8d": "?", 

1034 b"\x8e": ("#x17D", "17D"), 

1035 b"\x8f": "?", 

1036 b"\x90": "?", 

1037 b"\x91": ("lsquo", "2018"), 

1038 b"\x92": ("rsquo", "2019"), 

1039 b"\x93": ("ldquo", "201C"), 

1040 b"\x94": ("rdquo", "201D"), 

1041 b"\x95": ("bull", "2022"), 

1042 b"\x96": ("ndash", "2013"), 

1043 b"\x97": ("mdash", "2014"), 

1044 b"\x98": ("tilde", "2DC"), 

1045 b"\x99": ("trade", "2122"), 

1046 b"\x9a": ("scaron", "161"), 

1047 b"\x9b": ("rsaquo", "203A"), 

1048 b"\x9c": ("oelig", "153"), 

1049 b"\x9d": "?", 

1050 b"\x9e": ("#x17E", "17E"), 

1051 b"\x9f": ("Yuml", ""), 

1052 } 

1053 

1054 #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains 

1055 #: horrors like stripping diacritical marks to turn á into a, but also 

1056 #: contains non-horrors like turning “ into ". 

1057 #: 

1058 #: Seriously, don't use this for anything other than removing smart 

1059 #: quotes. 

1060 #: 

1061 #: :meta private: 

1062 MS_CHARS_TO_ASCII: Dict[bytes, str] = { 

1063 b"\x80": "EUR", 

1064 b"\x81": " ", 

1065 b"\x82": ",", 

1066 b"\x83": "f", 

1067 b"\x84": ",,", 

1068 b"\x85": "...", 

1069 b"\x86": "+", 

1070 b"\x87": "++", 

1071 b"\x88": "^", 

1072 b"\x89": "%", 

1073 b"\x8a": "S", 

1074 b"\x8b": "<", 

1075 b"\x8c": "OE", 

1076 b"\x8d": "?", 

1077 b"\x8e": "Z", 

1078 b"\x8f": "?", 

1079 b"\x90": "?", 

1080 b"\x91": "'", 

1081 b"\x92": "'", 

1082 b"\x93": '"', 

1083 b"\x94": '"', 

1084 b"\x95": "*", 

1085 b"\x96": "-", 

1086 b"\x97": "--", 

1087 b"\x98": "~", 

1088 b"\x99": "(TM)", 

1089 b"\x9a": "s", 

1090 b"\x9b": ">", 

1091 b"\x9c": "oe", 

1092 b"\x9d": "?", 

1093 b"\x9e": "z", 

1094 b"\x9f": "Y", 

1095 b"\xa0": " ", 

1096 b"\xa1": "!", 

1097 b"\xa2": "c", 

1098 b"\xa3": "GBP", 

1099 b"\xa4": "$", # This approximation is especially parochial--this is the 

1100 # generic currency symbol. 

1101 b"\xa5": "YEN", 

1102 b"\xa6": "|", 

1103 b"\xa7": "S", 

1104 b"\xa8": "..", 

1105 b"\xa9": "", 

1106 b"\xaa": "(th)", 

1107 b"\xab": "<<", 

1108 b"\xac": "!", 

1109 b"\xad": " ", 

1110 b"\xae": "(R)", 

1111 b"\xaf": "-", 

1112 b"\xb0": "o", 

1113 b"\xb1": "+-", 

1114 b"\xb2": "2", 

1115 b"\xb3": "3", 

1116 b"\xb4": "'", 

1117 b"\xb5": "u", 

1118 b"\xb6": "P", 

1119 b"\xb7": "*", 

1120 b"\xb8": ",", 

1121 b"\xb9": "1", 

1122 b"\xba": "(th)", 

1123 b"\xbb": ">>", 

1124 b"\xbc": "1/4", 

1125 b"\xbd": "1/2", 

1126 b"\xbe": "3/4", 

1127 b"\xbf": "?", 

1128 b"\xc0": "A", 

1129 b"\xc1": "A", 

1130 b"\xc2": "A", 

1131 b"\xc3": "A", 

1132 b"\xc4": "A", 

1133 b"\xc5": "A", 

1134 b"\xc6": "AE", 

1135 b"\xc7": "C", 

1136 b"\xc8": "E", 

1137 b"\xc9": "E", 

1138 b"\xca": "E", 

1139 b"\xcb": "E", 

1140 b"\xcc": "I", 

1141 b"\xcd": "I", 

1142 b"\xce": "I", 

1143 b"\xcf": "I", 

1144 b"\xd0": "D", 

1145 b"\xd1": "N", 

1146 b"\xd2": "O", 

1147 b"\xd3": "O", 

1148 b"\xd4": "O", 

1149 b"\xd5": "O", 

1150 b"\xd6": "O", 

1151 b"\xd7": "*", 

1152 b"\xd8": "O", 

1153 b"\xd9": "U", 

1154 b"\xda": "U", 

1155 b"\xdb": "U", 

1156 b"\xdc": "U", 

1157 b"\xdd": "Y", 

1158 b"\xde": "b", 

1159 b"\xdf": "B", 

1160 b"\xe0": "a", 

1161 b"\xe1": "a", 

1162 b"\xe2": "a", 

1163 b"\xe3": "a", 

1164 b"\xe4": "a", 

1165 b"\xe5": "a", 

1166 b"\xe6": "ae", 

1167 b"\xe7": "c", 

1168 b"\xe8": "e", 

1169 b"\xe9": "e", 

1170 b"\xea": "e", 

1171 b"\xeb": "e", 

1172 b"\xec": "i", 

1173 b"\xed": "i", 

1174 b"\xee": "i", 

1175 b"\xef": "i", 

1176 b"\xf0": "o", 

1177 b"\xf1": "n", 

1178 b"\xf2": "o", 

1179 b"\xf3": "o", 

1180 b"\xf4": "o", 

1181 b"\xf5": "o", 

1182 b"\xf6": "o", 

1183 b"\xf7": "/", 

1184 b"\xf8": "o", 

1185 b"\xf9": "u", 

1186 b"\xfa": "u", 

1187 b"\xfb": "u", 

1188 b"\xfc": "u", 

1189 b"\xfd": "y", 

1190 b"\xfe": "b", 

1191 b"\xff": "y", 

1192 } 

1193 

#: A map used when removing rogue Windows-1252/ISO-8859-1
#: characters in otherwise UTF-8 documents. Each value is the UTF-8
#: encoding of the character that the key byte represents in
#: Windows-1252 (used by `detwingle`).
#:
#: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
#: Windows-1252.
#:
#: :meta hide-value:
WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
    0x80: b"\xe2\x82\xac",  # €
    0x82: b"\xe2\x80\x9a",  # ‚
    0x83: b"\xc6\x92",  # ƒ
    0x84: b"\xe2\x80\x9e",  # „
    0x85: b"\xe2\x80\xa6",  # …
    0x86: b"\xe2\x80\xa0",  # †
    0x87: b"\xe2\x80\xa1",  # ‡
    0x88: b"\xcb\x86",  # ˆ
    0x89: b"\xe2\x80\xb0",  # ‰
    0x8A: b"\xc5\xa0",  # Š
    0x8B: b"\xe2\x80\xb9",  # ‹
    0x8C: b"\xc5\x92",  # Œ
    0x8E: b"\xc5\xbd",  # Ž
    0x91: b"\xe2\x80\x98",  # ‘
    0x92: b"\xe2\x80\x99",  # ’
    0x93: b"\xe2\x80\x9c",  # “
    0x94: b"\xe2\x80\x9d",  # ”
    0x95: b"\xe2\x80\xa2",  # •
    0x96: b"\xe2\x80\x93",  # –
    0x97: b"\xe2\x80\x94",  # —
    0x98: b"\xcb\x9c",  # ˜
    0x99: b"\xe2\x84\xa2",  # ™
    0x9A: b"\xc5\xa1",  # š
    0x9B: b"\xe2\x80\xba",  # ›
    0x9C: b"\xc5\x93",  # œ
    0x9E: b"\xc5\xbe",  # ž
    0x9F: b"\xc5\xb8",  # Ÿ
    0xA0: b"\xc2\xa0",  # non-breaking space
    0xA1: b"\xc2\xa1",  # ¡
    0xA2: b"\xc2\xa2",  # ¢
    0xA3: b"\xc2\xa3",  # £
    0xA4: b"\xc2\xa4",  # ¤
    0xA5: b"\xc2\xa5",  # ¥
    0xA6: b"\xc2\xa6",  # ¦
    0xA7: b"\xc2\xa7",  # §
    0xA8: b"\xc2\xa8",  # ¨
    0xA9: b"\xc2\xa9",  # ©
    0xAA: b"\xc2\xaa",  # ª
    0xAB: b"\xc2\xab",  # «
    0xAC: b"\xc2\xac",  # ¬
    0xAD: b"\xc2\xad",  # soft hyphen
    0xAE: b"\xc2\xae",  # ®
    0xAF: b"\xc2\xaf",  # ¯
    0xB0: b"\xc2\xb0",  # °
    0xB1: b"\xc2\xb1",  # ±
    0xB2: b"\xc2\xb2",  # ²
    0xB3: b"\xc2\xb3",  # ³
    0xB4: b"\xc2\xb4",  # ´
    0xB5: b"\xc2\xb5",  # µ
    0xB6: b"\xc2\xb6",  # ¶
    0xB7: b"\xc2\xb7",  # ·
    0xB8: b"\xc2\xb8",  # ¸
    0xB9: b"\xc2\xb9",  # ¹
    0xBA: b"\xc2\xba",  # º
    0xBB: b"\xc2\xbb",  # »
    0xBC: b"\xc2\xbc",  # ¼
    0xBD: b"\xc2\xbd",  # ½
    0xBE: b"\xc2\xbe",  # ¾
    0xBF: b"\xc2\xbf",  # ¿
    0xC0: b"\xc3\x80",  # À
    0xC1: b"\xc3\x81",  # Á
    0xC2: b"\xc3\x82",  # Â
    0xC3: b"\xc3\x83",  # Ã
    0xC4: b"\xc3\x84",  # Ä
    0xC5: b"\xc3\x85",  # Å
    0xC6: b"\xc3\x86",  # Æ
    0xC7: b"\xc3\x87",  # Ç
    0xC8: b"\xc3\x88",  # È
    0xC9: b"\xc3\x89",  # É
    0xCA: b"\xc3\x8a",  # Ê
    0xCB: b"\xc3\x8b",  # Ë
    0xCC: b"\xc3\x8c",  # Ì
    0xCD: b"\xc3\x8d",  # Í
    0xCE: b"\xc3\x8e",  # Î
    0xCF: b"\xc3\x8f",  # Ï
    0xD0: b"\xc3\x90",  # Ð
    0xD1: b"\xc3\x91",  # Ñ
    0xD2: b"\xc3\x92",  # Ò
    0xD3: b"\xc3\x93",  # Ó
    0xD4: b"\xc3\x94",  # Ô
    0xD5: b"\xc3\x95",  # Õ
    0xD6: b"\xc3\x96",  # Ö
    0xD7: b"\xc3\x97",  # ×
    0xD8: b"\xc3\x98",  # Ø
    0xD9: b"\xc3\x99",  # Ù
    0xDA: b"\xc3\x9a",  # Ú
    0xDB: b"\xc3\x9b",  # Û
    0xDC: b"\xc3\x9c",  # Ü
    0xDD: b"\xc3\x9d",  # Ý
    0xDE: b"\xc3\x9e",  # Þ
    0xDF: b"\xc3\x9f",  # ß
    0xE0: b"\xc3\xa0",  # à
    # Fixed: was b"\xa1", which is not valid UTF-8 on its own;
    # U+00E1 (á) encodes as the two-byte sequence \xc3\xa1.
    0xE1: b"\xc3\xa1",  # á
    0xE2: b"\xc3\xa2",  # â
    0xE3: b"\xc3\xa3",  # ã
    0xE4: b"\xc3\xa4",  # ä
    0xE5: b"\xc3\xa5",  # å
    0xE6: b"\xc3\xa6",  # æ
    0xE7: b"\xc3\xa7",  # ç
    0xE8: b"\xc3\xa8",  # è
    0xE9: b"\xc3\xa9",  # é
    0xEA: b"\xc3\xaa",  # ê
    0xEB: b"\xc3\xab",  # ë
    0xEC: b"\xc3\xac",  # ì
    0xED: b"\xc3\xad",  # í
    0xEE: b"\xc3\xae",  # î
    0xEF: b"\xc3\xaf",  # ï
    0xF0: b"\xc3\xb0",  # ð
    0xF1: b"\xc3\xb1",  # ñ
    0xF2: b"\xc3\xb2",  # ò
    0xF3: b"\xc3\xb3",  # ó
    0xF4: b"\xc3\xb4",  # ô
    0xF5: b"\xc3\xb5",  # õ
    0xF6: b"\xc3\xb6",  # ö
    0xF7: b"\xc3\xb7",  # ÷
    0xF8: b"\xc3\xb8",  # ø
    0xF9: b"\xc3\xb9",  # ù
    0xFA: b"\xc3\xba",  # ú
    0xFB: b"\xc3\xbb",  # û
    0xFC: b"\xc3\xbc",  # ü
    0xFD: b"\xc3\xbd",  # ý
    0xFE: b"\xc3\xbe",  # þ
}

1325 

1326 #: :meta private: 

1327 MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [ 

1328 (0xC2, 0xDF, 2), # 2-byte characters start with a byte C2-DF 

1329 (0xE0, 0xEF, 3), # 3-byte characters start with E0-EF 

1330 (0xF0, 0xF4, 4), # 4-byte characters start with F0-F4 

1331 ] 

1332 

1333 #: :meta private: 

1334 FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0] 

1335 

1336 #: :meta private: 

1337 LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1] 

1338 

@classmethod
def detwingle(
    cls,
    in_bytes: bytes,
    main_encoding: _Encoding = "utf8",
    embedded_encoding: _Encoding = "windows-1252",
) -> bytes:
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this *must*
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of ``in_bytes``.
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document.
    :return: A bytestring similar to ``in_bytes``, in which
        ``embedded_encoding`` characters have been converted to
        their ``main_encoding`` equivalents.
    """
    if embedded_encoding.replace("_", "-").lower() not in (
        "windows-1252",
        "windows_1252",
    ):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings."
        )

    if main_encoding.lower() not in ("utf8", "utf-8"):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding."
        )

    chunks = []
    segment_start = 0  # start of the run of untouched bytes
    cursor = 0
    total = len(in_bytes)
    while cursor < total:
        current = in_bytes[cursor]
        if cls.FIRST_MULTIBYTE_MARKER <= current <= cls.LAST_MULTIBYTE_MARKER:
            # The start of a UTF-8 multibyte character: skip past
            # the whole sequence so its continuation bytes aren't
            # mistaken for Windows-1252 characters.
            for low, high, width in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if low <= current <= high:
                    cursor += width
                    break
        elif current >= 0x80 and current in cls.WINDOWS_1252_TO_UTF8:
            # A Windows-1252 character: flush the pending run of
            # untouched bytes, then emit the UTF-8 translation as
            # its own chunk.
            chunks.append(in_bytes[segment_start:cursor])
            chunks.append(cls.WINDOWS_1252_TO_UTF8[current])
            cursor += 1
            segment_start = cursor
        else:
            # Plain byte; keep scanning.
            cursor += 1
    if segment_start == 0:
        # Nothing was translated; return the input unchanged.
        return in_bytes
    chunks.append(in_bytes[segment_start:])
    return b"".join(chunks)