Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/dammit.py: 42%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

408 statements  

1# -*- coding: utf-8 -*- 

2"""Beautiful Soup bonus library: Unicode, Dammit 

3 

4This library converts a bytestream to Unicode through any means 

5necessary. It is heavily based on code from Mark Pilgrim's `Universal 

6Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained 

7by Kurt McKee. It does not rewrite the body of an XML or HTML document 

8to reflect a new encoding; that's the job of `TreeBuilder`. 

9 

10""" 

11 

12# Use of this source code is governed by the MIT license. 

13__license__ = "MIT" 

14 

15from html.entities import codepoint2name 

16from collections import defaultdict 

17import codecs 

18from html.entities import html5 

19import re 

20from logging import Logger, getLogger 

21from types import ModuleType 

22from typing import ( 

23 Dict, 

24 Iterator, 

25 List, 

26 Optional, 

27 Pattern, 

28 Set, 

29 Tuple, 

30 Type, 

31 Union, 

32 cast, 

33) 

34from typing_extensions import Literal 

35from bs4._typing import ( 

36 _Encoding, 

37 _Encodings, 

38) 

39import warnings 

40 

41# Import a library to autodetect character encodings. We'll support 

42# any of a number of libraries that all support the same API: 

43# 

44# * cchardet 

45# * chardet 

46# * charset-normalizer 

# The first detection library that imports successfully wins; the
# candidates are tried in order of preference and the search stops as
# soon as one is found.
chardet_module: Optional[ModuleType] = None

try:
    # PyPI package: cchardet
    import cchardet  # type:ignore
except ImportError:
    pass
else:
    chardet_module = cchardet

if chardet_module is None:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        pass
    else:
        chardet_module = chardet

if chardet_module is None:
    try:
        # PyPI package: charset-normalizer
        import charset_normalizer  # type:ignore
    except ImportError:
        # No detection library is installed; chardet_module stays None.
        pass
    else:
        chardet_module = charset_normalizer

69 

70 

def _chardet_dammit(s: bytes) -> Optional[str]:
    """Ask the imported detection library to guess the encoding of ``s``.

    :param s: The bytestring to analyze.
    :return: The value stored under the ``"encoding"`` key of the
        library's ``detect()`` result, or None when no detection
        library was imported or ``s`` is already a ``str``.
    """
    detector = chardet_module
    if detector is None or isinstance(s, str):
        return None
    return detector.detect(s)["encoding"]

77 

78 

79# Build bytestring and Unicode versions of regular expressions for finding 

80# a declared encoding inside an XML or HTML document. 

#: Source text of the pattern that captures the encoding named in an
#: XML declaration at the start of a document.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
#: Source text of the pattern that captures the charset named in an
#: HTML <meta> tag.
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# Compiled, case-insensitive versions of both patterns, keyed first by
# the type of markup they can search (bytes or str) and then by the
# kind of document ("html" or "xml").
#
# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
encoding_res: Dict[Type, Dict[str, Pattern]] = {
    bytes: {
        "html": re.compile(html_meta.encode("ascii"), re.I),
        "xml": re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        "html": re.compile(html_meta, re.I),
        "xml": re.compile(xml_encoding, re.I),
    },
}

96 

97 

class EntitySubstitutionMeta(type):
    """Provides lazy access to some data structures and regular
    expressions used by EntitySubstitution which have a measurable
    startup cost.

    Each public property checks whether the tables have been built
    and, if not, builds all of them at once through
    `_populate_class_variables` before returning its value.
    """

    # Set to True by _populate_class_variables once the dictionaries
    # and regular expressions behind the properties below have been
    # built, so that the expensive setup runs only once.
    _CLASS_VARIABLES_POPULATED: bool = False

    @property
    def HTML_ENTITY_TO_CHARACTER(self) -> Dict[str, str]:
        """A mapping of entity names like "angmsdaa" to Unicode
        strings like "⦨".
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._HTML_ENTITY_TO_CHARACTER

    _HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    @property
    def CHARACTER_TO_HTML_ENTITY(self) -> Dict[str, str]:
        """A mapping of Unicode strings like "⦨" to entity names like
        "angmsdaa". When a single Unicode string has multiple entity
        names, we try to choose the most commonly-used name.
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._CHARACTER_TO_HTML_ENTITY

    _CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    @property
    def CHARACTER_TO_HTML_ENTITY_RE(self) -> Pattern[str]:
        """A regular expression matching (almost) any Unicode string
        that corresponds to an HTML5 named entity.
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._CHARACTER_TO_HTML_ENTITY_RE

    _CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    @property
    def CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE(self) -> Pattern[str]:
        """A very similar regular expression to
        CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
        ampersands. This is used by the 'html' formatter to provide
        backwards-compatibility, even though the HTML5 spec allows
        most ampersands to go unescaped.
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE

    _CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    def _populate_class_variables(self) -> None:
        """Initialize variables used by EntitySubstitution to manage the plethora of
        HTML and HTML5 named entities.

        This method populates the class variables necessary to make
        the properties defined in the metaclass work. It does nothing
        if they have already been populated.

        The alternatives of the generated regular expressions are
        emitted in sorted order, so the pattern text is the same on
        every run instead of depending on set iteration order.
        """
        if self._CLASS_VARIABLES_POPULATED:
            return
        unicode_to_name: Dict[str, str] = {}
        name_to_unicode: Dict[str, str] = {}

        short_entities: Set[str] = set()
        long_entities_by_first_character: Dict[str, Set[str]] = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters. The first name seen (in sorted order) wins.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities. The last name seen (in sorted order) wins.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            #
            # A bare "&" never gets here: it was skipped above along
            # with the other single ASCII characters.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles: Set[str] = set()
        for short in short_entities:
            # .get() rather than [] so that probing the defaultdict
            # does not insert an empty set for every short entity.
            long_versions = long_entities_by_first_character.get(short)
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join(sorted(x[1] for x in long_versions))
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            particles.update(long_entities)

        re_definition = "(%s)" % "|".join(sorted(particles))

        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(sorted(particles))

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        self._CHARACTER_TO_HTML_ENTITY = unicode_to_name
        self._HTML_ENTITY_TO_CHARACTER = name_to_unicode
        self._CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        self._CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )
        self._CLASS_VARIABLES_POPULATED = True

268 

class EntitySubstitution(metaclass=EntitySubstitutionMeta):
    """Helpers for replacing special characters with XML or HTML entities.

    The HTML lookup tables and regular expressions used below
    (``CHARACTER_TO_HTML_ENTITY`` and friends) are supplied lazily by
    the metaclass.
    """

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity. Group 1 is the text
    # between the ampersand and the semicolon.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Regex callback: turn the matched text into a named HTML entity.

        If the matched text has no entry in CHARACTER_TO_HTML_ENTITY,
        it is returned wrapped as ``&amp;<text>;`` instead.
        """
        matched_text = matchobj.group(0)
        entity_name = cls.CHARACTER_TO_HTML_ENTITY.get(matched_text)
        if entity_name is not None:
            return "&%s;" % entity_name
        return "&amp;%s;" % matched_text

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Regex callback: turn the matched character into a named XML
        entity, looked up in CHARACTER_TO_XML_ENTITY."""
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        """Regex callback: escape the ampersand of the matched entity,
        keeping the entity's name (group 1) as it was."""
        entity_name = matchobj.group(1)
        return "&amp;%s;" % entity_name

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        """Regex callback: escape the ampersand of the matched entity
        unless its name (group 1) is a key of HTML_ENTITY_TO_CHARACTER,
        in which case the entity is returned as-is."""
        candidate = matchobj.group(1)
        recognized = candidate in cls.HTML_ENTITY_TO_CHARACTER
        template = "&%s;" if recognized else "&amp;%s;"
        return template % candidate

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Wrap a value in quotes for use as an XML attribute, escaping
        it only when that cannot be avoided.

        A value without double quotes is wrapped in double quotes.

          Bob's Bar -> "Bob's Bar"

        A value with double quotes but no single quotes is wrapped in
        single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

        A value with both kinds of quote has its double quotes turned
        into ``&quot;`` and is then wrapped in double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Only double quotes are present, so single quotes can
            # delimit the attribute without any escaping.
            return "'" + value + "'"
        # Both kinds of quote are present. The double quotes are the
        # ones escaped because their entity is "&quot;" in both HTML
        # and XML; escaping the single quotes would mean choosing
        # between &apos; and &squot;.
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        Every less-than sign becomes &lt;, every greater-than sign
        becomes &gt;, and every ampersand becomes &amp;. To leave alone
        ampersands that look like the start of an entity, use
        `substitute_xml_containing_entities` instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
         with named entities.
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters, leaving
        existing entities untouched.

        :param value: A string to be substituted. The less-than sign will
         become &lt;, the greater-than sign will become &gt;, and any
         ampersands that are not part of an entity definition will
         become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with those characters replaced
         with named entities.
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        Ampersands are matched and replaced as well.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        pattern = cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE
        return pattern.sub(cls._substitute_html_entity, s)

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

          "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
          that is followed by one or more ASCII alphanumerics, followed
          by a U+003B SEMICOLON character (;), where these characters do
          not match any of the names given in the named character
          references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # Step one: escape the ampersand of anything shaped like an entity.
        entities_escaped = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Step two: swap eligible characters for unescaped named entities.
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, entities_escaped
        )

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # Step one: escape the ampersand of anything shaped like an
        # entity whose name is not recognized. Every other ampersand
        # is left alone.
        unknown_escaped = cls.ANY_ENTITY_RE.sub(
            cls._escape_unrecognized_entity_name, s
        )

        # Step two: swap eligible characters for unescaped named entities.
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, unknown_escaped
        )

492 

493 

class EncodingDetector:
    """Produces, in priority order, the encodings worth trying for a
    bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
       (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
       byte-order mark sniffing fails (the ``user_encodings`` argument to the
       constructor).

    4. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.

    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        definite = list(known_definite_encodings or [])
        self.known_definite_encodings = definite
        if override_encodings:
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            # The deprecated encodings are queued right after the
            # known definite ones.
            definite.extend(override_encodings)
        self.user_encodings = user_encodings or []
        # Stored lowercased because _usable compares lowercased names.
        self.exclude_encodings = {name.lower() for name in (exclude_encodings or [])}
        self.chardet_encoding = None
        self.is_html = is_html if is_html is not None else False
        self.declared_encoding: Optional[str] = None

        # First order of business: strip a byte-order mark.
        stripped, bom_encoding = self.strip_byte_order_mark(markup)
        self.markup = stripped
        self.sniffed_encoding = bom_encoding

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Decide whether an encoding is worth trying.

        An encoding is usable when it is not None, is not excluded,
        and has not been tried already. Names are compared lowercased.

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. A usable
            encoding is added to this set as a side effect.
        :return: True if the encoding should be tried now.
        """
        if encoding is None:
            return False
        normalized = encoding.lower()
        if normalized in self.exclude_encodings or normalized in tried:
            return False
        tried.add(normalized)
        return True

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        Each encoding is yielded at most once, and the later, more
        expensive sources are consulted only if the caller keeps
        iterating.

        :yield: A sequence of strings. Each is the name of an encoding
            that *might* work to convert a bytestring into Unicode.
        """
        seen: Set[_Encoding] = set()

        # First, the known definite encodings.
        yield from (
            candidate
            for candidate in self.known_definite_encodings
            if self._usable(candidate, seen)
        )

        # Next, whatever the byte-order mark (if any) implied.
        bom_encoding = self.sniffed_encoding
        if bom_encoding is not None and self._usable(bom_encoding, seen):
            yield bom_encoding

        # Then the encodings the user asked for.
        yield from (
            candidate
            for candidate in self.user_encodings
            if self._usable(candidate, seen)
        )

        # Then an XML or HTML encoding declaration inside the
        # document; the result of the search is remembered.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        declared = self.declared_encoding
        if declared is not None and self._usable(declared, seen):
            yield declared

        # Then a guess from third-party character set detection; the
        # guess is remembered too.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        guessed = self.chardet_encoding
        if guessed is not None and self._usable(guessed, seen):
            yield guessed

        # As a last-ditch effort, try utf-8 and windows-1252.
        yield from (
            fallback
            for fallback in ("utf-8", "windows-1252")
            if self._usable(fallback, seen)
        )

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
            byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, None

        # The UTF-16 marks are checked first, and only count when the
        # data is at least four bytes long and the two bytes after the
        # mark are not both zero.
        if len(data) >= 4 and data[2:4] != b"\x00\x00":
            for mark, name in ((b"\xfe\xff", "utf-16be"), (b"\xff\xfe", "utf-16le")):
                if data[:2] == mark:
                    return data[2:], name

        remaining_marks = (
            (b"\xef\xbb\xbf", "utf-8"),
            (b"\x00\x00\xfe\xff", "utf-32be"),
            (b"\xff\xfe\x00\x00", "utf-32le"),
        )
        for mark, name in remaining_marks:
            width = len(mark)
            if data[:width] == mark:
                return data[width:], name

        return data, None

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Look inside a document for a declaration of its own encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document. The <meta> tag is only looked for
        when no XML declaration was found and ``is_html`` is true.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to be declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data. Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, lowercased, if one is found.
        """
        markup_length = len(markup)
        if search_entire_document:
            xml_endpos = html_endpos = markup_length
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(markup_length * 0.05))

        # Use the patterns compiled for the type of markup at hand.
        patterns = encoding_res[bytes] if isinstance(markup, bytes) else encoding_res[str]
        xml_pattern = patterns["xml"]
        html_pattern = patterns["html"]

        found = xml_pattern.search(markup, endpos=xml_endpos)
        if not found and is_html:
            found = html_pattern.search(markup, endpos=html_endpos)
        if found is None:
            return None

        declared = found.groups()[0]
        if not declared:
            # The declaration was there, but it named nothing.
            return None
        if isinstance(declared, bytes):
            declared = declared.decode("ascii", "replace")
        return declared.lower()

741 

742 

743class UnicodeDammit: 

744 """A class for detecting the encoding of a bytestring containing an 

745 HTML or XML document, and decoding it to Unicode. If the source 

746 encoding is windows-1252, `UnicodeDammit` can also replace 

747 Microsoft smart quotes with their HTML or XML equivalents. 

748 

749 :param markup: HTML or XML markup in an unknown encoding. 

750 

751 :param known_definite_encodings: When determining the encoding 

752 of ``markup``, these encodings will be tried first, in 

753 order. In HTML terms, this corresponds to the "known 

754 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_. 

755 

756 :param user_encodings: These encodings will be tried after the 

757 ``known_definite_encodings`` have been tried and failed, and 

758 after an attempt to sniff the encoding by looking at a 

759 byte order mark has failed. In HTML terms, this 

760 corresponds to the step "user has explicitly instructed 

761 the user agent to override the document's character 

762 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_. 

763 

764 :param override_encodings: A **deprecated** alias for 

765 ``known_definite_encodings``. Any encodings here will be tried 

766 immediately after the encodings in 

767 ``known_definite_encodings``. 

768 

769 :param smart_quotes_to: By default, Microsoft smart quotes will, 

770 like all other characters, be converted to Unicode 

771 characters. Setting this to ``ascii`` will convert them to ASCII 

772 quotes instead. Setting it to ``xml`` will convert them to XML 

773 entity references, and setting it to ``html`` will convert them 

774 to HTML entity references. 

775 

776 :param is_html: If True, ``markup`` is treated as an HTML 

777 document. Otherwise it's treated as an XML document. 

778 

779 :param exclude_encodings: These encodings will not be considered, 

780 even if the sniffing code thinks they might make sense. 

781 

782 """ 

783 

def __init__(
    self,
    markup: bytes,
    known_definite_encodings: Optional[_Encodings] = [],
    smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
    is_html: bool = False,
    exclude_encodings: Optional[_Encodings] = [],
    user_encodings: Optional[_Encodings] = None,
    override_encodings: Optional[_Encodings] = None,
):
    """Convert ``markup`` to Unicode, recording how the conversion went.

    The encoding-related arguments are forwarded untouched to
    `EncodingDetector`, which supplies the candidate encodings. Each
    candidate is tried with strict decoding first; only if every strict
    attempt fails are the candidates retried with the "replace" error
    handler.

    :param markup: The bytestring to convert. A `str` is also accepted
        and short-circuits the whole detection process.
    :param smart_quotes_to: How `_sub_ms_char` should rewrite Microsoft
        smart quotes, or None to leave them alone.
    :param is_html: Whether the markup should be treated as HTML.
    """
    # NOTE: the list defaults are shared between calls, but this method
    # only forwards them to EncodingDetector and never mutates them.
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = getLogger(__name__)
    self.detector = EncodingDetector(
        markup,
        known_definite_encodings,
        is_html,
        exclude_encodings,
        user_encodings,
        override_encodings,
    )

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, str):
        self.markup = markup.encode("utf8")
        self.unicode_markup = markup
        self.original_encoding = None
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # Compare against None rather than testing truthiness: an empty
    # document decodes successfully to "", and that must not be mistaken
    # for a failure (which would log a bogus warning and set
    # contains_replacement_characters).
    if u is None:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.
        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER."
                )

                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    #
    # Note that this is extremely unlikely, probably impossible,
    # because the "replace" strategy is so powerful. Even running
    # the Python binary through Unicode, Dammit gives you Unicode,
    # albeit Unicode riddled with REPLACEMENT CHARACTER.
    if u is None:
        self.original_encoding = None
        self.unicode_markup = None
    else:
        self.unicode_markup = u

855 

#: The original markup, before it was converted to Unicode.
#: This is not necessarily the same as what was passed in to the
#: constructor, since any byte-order mark will be stripped. If the
#: constructor was given a `str`, this holds its UTF-8 encoding.
markup: bytes

#: The Unicode version of the markup, following conversion. This
#: is set to None if there was simply no way to convert the
#: bytestring to Unicode (as with binary data).
unicode_markup: Optional[str]

#: This is True if `UnicodeDammit.unicode_markup` contains
#: U+FFFD REPLACEMENT_CHARACTER characters which were not present
#: in `UnicodeDammit.markup`. These mark character sequences that
#: could not be represented in Unicode.
contains_replacement_characters: bool

#: Unicode, Dammit's best guess as to the original character
#: encoding of `UnicodeDammit.markup`. None if the constructor was
#: given a `str`, or if no encoding worked.
original_encoding: Optional[_Encoding]

#: The strategy used to handle Microsoft smart quotes.
smart_quotes_to: Optional[str]

#: The (encoding, error handling strategy) 2-tuples that were used to
#: try and convert the markup to Unicode. Appended to by
#: `_convert_from`, which also uses it to skip repeated attempts.
tried_encodings: List[Tuple[_Encoding, str]]

log: Logger  #: :meta private:

884 

def _sub_ms_char(self, match: re.Match) -> bytes:
    """Replace one matched Microsoft "smart" byte with its substitute.

    Depending on `smart_quotes_to`, the byte captured in group 1 becomes
    an ASCII approximation ("ascii"), an XML numeric character reference
    ("xml"), or an HTML named entity (anything else). A byte missing
    from the relevant table is returned unchanged.

    TODO: Since this is only used to convert smart quotes, it
    could be simplified, and MS_CHARS_TO_ASCII made much less
    parochial.

    :param match: A regular expression match whose first group is the
        single byte to replace.
    :return: The replacement bytes.
    """
    ms_byte: bytes = match.group(1)

    if self.smart_quotes_to == "ascii":
        if ms_byte not in self.MS_CHARS_TO_ASCII:
            # Shouldn't happen; leave the byte as it is.
            return ms_byte
        return self.MS_CHARS_TO_ASCII[ms_byte].encode()

    if ms_byte not in self.MS_CHARS:
        # Shouldn't happen; leave the byte as it is.
        return ms_byte

    replacement = self.MS_CHARS[ms_byte]
    if type(replacement) is not tuple:
        # A plain string is used verbatim, whatever the target format.
        return cast(str, replacement).encode()

    # Tuples are (HTML entity name, hexadecimal code point).
    if self.smart_quotes_to == "xml":
        return b"&#x" + replacement[1].encode() + b";"
    return b"&" + replacement[0].encode() + b";"

918 

#: This dictionary maps commonly seen values for "charset" in HTML
#: meta tags to the corresponding Python codec names. It only covers
#: values that aren't in Python's aliases and can't be determined
#: by the heuristics in `find_codec`.
#:
#: `find_codec` looks a charset up here exactly as given (the lookup
#: is case-sensitive) before trying any other spelling.
#:
#: :meta hide-value:
CHARSET_ALIASES: Dict[str, _Encoding] = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}

929 

#: A list of encodings that tend to contain Microsoft smart quotes.
#: `_convert_from` only rewrites smart quotes when the codec name it
#: is about to try appears in this list.
#:
#: :meta hide-value:
ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]

938 

def _convert_from(
    self, proposed: _Encoding, errors: str = "strict"
) -> Optional[str]:
    """Try to decode the markup using one encoding and error strategy.

    Every distinct (codec, errors) pair is attempted at most once; the
    attempts are recorded in `tried_encodings`. On success the result is
    stored in `unicode_markup` and the codec name in `original_encoding`.

    :param proposed: The name of a character encoding.
    :param errors: An error handling strategy, used when calling `str`.
    :return: The converted markup, or `None` if the encoding is unknown,
        was already tried with this strategy, or failed to decode.
    """
    codec_name = self.find_codec(proposed)
    if codec_name is None:
        return None

    attempt = (codec_name, errors)
    if attempt in self.tried_encodings:
        return None
    self.tried_encodings.append(attempt)

    data = self.markup
    might_have_smart_quotes = (
        self.smart_quotes_to is not None
        and codec_name in self.ENCODINGS_WITH_SMART_QUOTES
    )
    if might_have_smart_quotes:
        # Rewrite the smart-quote bytes before decoding, so that they
        # come out as entities or ASCII instead of control characters.
        data = re.compile(b"([\x80-\x9f])").sub(self._sub_ms_char, data)

    try:
        decoded = self._to_unicode(data, codec_name, errors)
        self.unicode_markup = decoded
        self.original_encoding = codec_name
    except Exception:
        # Any failure simply means this candidate didn't work out.
        return None
    return self.unicode_markup

977 

def _to_unicode(
    self, data: bytes, encoding: _Encoding, errors: str = "strict"
) -> str:
    """Decode a bytestring into Unicode using the named encoding.

    :param data: The bytestring to decode.
    :param encoding: The name of an encoding.
    :param errors: An error handling strategy, used when calling `str`.
    :return: The decoded string.
    """
    return str(data, encoding=encoding, errors=errors)

987 

@property
def declared_html_encoding(self) -> Optional[_Encoding]:
    """The encoding declared *inside* the document, for HTML markup only.

    :return: The detector's declared encoding when the markup is being
        treated as HTML, otherwise None.
    """
    return self.detector.declared_encoding if self.is_html else None

996 

def find_codec(self, charset: _Encoding) -> Optional[str]:
    """Look up the Python codec corresponding to a given character set.

    The name is tried as given (after consulting `CHARSET_ALIASES`),
    then with hyphens removed, then with hyphens turned into
    underscores. If none of those names a known codec, the charset
    itself is used.

    :param charset: The name of a character set.
    :return: The lowercased codec name, or None if `charset` is empty.
    """
    resolved = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not resolved and charset:
        resolved = (
            self._codec(charset.replace("-", ""))
            or self._codec(charset.replace("-", "_"))
            or charset
        )
    return resolved.lower() if resolved else None

1013 

def _codec(self, charset: _Encoding) -> Optional[str]:
    """Return `charset` if Python has a codec by that name.

    :param charset: A candidate codec name.
    :return: `charset` itself when `codecs.lookup` accepts it (or when
        it is empty), otherwise None.
    """
    if not charset:
        return charset
    try:
        codecs.lookup(charset)
    except (LookupError, ValueError):
        return None
    return charset

1024 

#: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
#:
#: Each key is a single byte. A tuple value is (HTML entity name,
#: hexadecimal Unicode code point); `_sub_ms_char` wraps the first as
#: ``&name;`` and the second as ``&#xHEX;``. A plain string value is
#: substituted verbatim.
#:
#: :meta hide-value:
MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
    b"\x80": ("euro", "20AC"),
    b"\x81": " ",
    b"\x82": ("sbquo", "201A"),
    b"\x83": ("fnof", "192"),
    b"\x84": ("bdquo", "201E"),
    b"\x85": ("hellip", "2026"),
    b"\x86": ("dagger", "2020"),
    b"\x87": ("Dagger", "2021"),
    b"\x88": ("circ", "2C6"),
    b"\x89": ("permil", "2030"),
    b"\x8a": ("Scaron", "160"),
    b"\x8b": ("lsaquo", "2039"),
    b"\x8c": ("OElig", "152"),
    b"\x8d": "?",
    b"\x8e": ("#x17D", "17D"),
    b"\x8f": "?",
    b"\x90": "?",
    b"\x91": ("lsquo", "2018"),
    b"\x92": ("rsquo", "2019"),
    b"\x93": ("ldquo", "201C"),
    b"\x94": ("rdquo", "201D"),
    b"\x95": ("bull", "2022"),
    b"\x96": ("ndash", "2013"),
    b"\x97": ("mdash", "2014"),
    b"\x98": ("tilde", "2DC"),
    b"\x99": ("trade", "2122"),
    b"\x9a": ("scaron", "161"),
    b"\x9b": ("rsaquo", "203A"),
    b"\x9c": ("oelig", "153"),
    b"\x9d": "?",
    b"\x9e": ("#x17E", "17E"),
    # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. The code point must
    # not be empty, or XML output becomes the malformed reference "&#x;".
    b"\x9f": ("Yuml", "178"),
}

1062 

#: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
#: horrors like stripping diacritical marks to turn á into a, but also
#: contains non-horrors like turning “ into ".
#:
#: Seriously, don't use this for anything other than removing smart
#: quotes.
#:
#: Keys are single bytes and values are the ASCII text substituted for
#: them by `_sub_ms_char` when `smart_quotes_to` is "ascii".
#:
#: :meta private:
MS_CHARS_TO_ASCII: Dict[bytes, str] = {
    b"\x80": "EUR",
    b"\x81": " ",
    b"\x82": ",",
    b"\x83": "f",
    b"\x84": ",,",
    b"\x85": "...",
    b"\x86": "+",
    b"\x87": "++",
    b"\x88": "^",
    b"\x89": "%",
    b"\x8a": "S",
    b"\x8b": "<",
    b"\x8c": "OE",
    b"\x8d": "?",
    b"\x8e": "Z",
    b"\x8f": "?",
    b"\x90": "?",
    b"\x91": "'",
    b"\x92": "'",
    b"\x93": '"',
    b"\x94": '"',
    b"\x95": "*",
    b"\x96": "-",
    b"\x97": "--",
    b"\x98": "~",
    b"\x99": "(TM)",
    b"\x9a": "s",
    b"\x9b": ">",
    b"\x9c": "oe",
    b"\x9d": "?",
    b"\x9e": "z",
    b"\x9f": "Y",
    b"\xa0": " ",
    b"\xa1": "!",
    b"\xa2": "c",
    b"\xa3": "GBP",
    b"\xa4": "$",  # This approximation is especially parochial--this is the
    # generic currency symbol.
    b"\xa5": "YEN",
    b"\xa6": "|",
    b"\xa7": "S",
    b"\xa8": "..",
    # NOTE(review): the copyright sign maps to the empty string, unlike
    # the registered sign below, which becomes "(R)" -- confirm that
    # dropping it is intentional.
    b"\xa9": "",
    b"\xaa": "(th)",
    b"\xab": "<<",
    b"\xac": "!",
    b"\xad": " ",
    b"\xae": "(R)",
    b"\xaf": "-",
    b"\xb0": "o",
    b"\xb1": "+-",
    b"\xb2": "2",
    b"\xb3": "3",
    b"\xb4": "'",
    b"\xb5": "u",
    b"\xb6": "P",
    b"\xb7": "*",
    b"\xb8": ",",
    b"\xb9": "1",
    b"\xba": "(th)",
    b"\xbb": ">>",
    b"\xbc": "1/4",
    b"\xbd": "1/2",
    b"\xbe": "3/4",
    b"\xbf": "?",
    b"\xc0": "A",
    b"\xc1": "A",
    b"\xc2": "A",
    b"\xc3": "A",
    b"\xc4": "A",
    b"\xc5": "A",
    b"\xc6": "AE",
    b"\xc7": "C",
    b"\xc8": "E",
    b"\xc9": "E",
    b"\xca": "E",
    b"\xcb": "E",
    b"\xcc": "I",
    b"\xcd": "I",
    b"\xce": "I",
    b"\xcf": "I",
    b"\xd0": "D",
    b"\xd1": "N",
    b"\xd2": "O",
    b"\xd3": "O",
    b"\xd4": "O",
    b"\xd5": "O",
    b"\xd6": "O",
    b"\xd7": "*",
    b"\xd8": "O",
    b"\xd9": "U",
    b"\xda": "U",
    b"\xdb": "U",
    b"\xdc": "U",
    b"\xdd": "Y",
    b"\xde": "b",
    b"\xdf": "B",
    b"\xe0": "a",
    b"\xe1": "a",
    b"\xe2": "a",
    b"\xe3": "a",
    b"\xe4": "a",
    b"\xe5": "a",
    b"\xe6": "ae",
    b"\xe7": "c",
    b"\xe8": "e",
    b"\xe9": "e",
    b"\xea": "e",
    b"\xeb": "e",
    b"\xec": "i",
    b"\xed": "i",
    b"\xee": "i",
    b"\xef": "i",
    b"\xf0": "o",
    b"\xf1": "n",
    b"\xf2": "o",
    b"\xf3": "o",
    b"\xf4": "o",
    b"\xf5": "o",
    b"\xf6": "o",
    b"\xf7": "/",
    b"\xf8": "o",
    b"\xf9": "u",
    b"\xfa": "u",
    b"\xfb": "u",
    b"\xfc": "u",
    b"\xfd": "y",
    b"\xfe": "b",
    b"\xff": "y",
}

1202 

#: A map used when removing rogue Windows-1252/ISO-8859-1
#: characters in otherwise UTF-8 documents. Also used when a
#: numeric character entity has been incorrectly encoded using the
#: character's Windows-1252 encoding.
#:
#: Each key is a Windows-1252 byte value and each value is the UTF-8
#: encoding of the character that byte represents.
#:
#: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
#: Windows-1252.
#:
#: :meta hide-value:
WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
    0x80: b"\xe2\x82\xac",  # €
    0x82: b"\xe2\x80\x9a",  # ‚
    0x83: b"\xc6\x92",  # ƒ
    0x84: b"\xe2\x80\x9e",  # „
    0x85: b"\xe2\x80\xa6",  # …
    0x86: b"\xe2\x80\xa0",  # †
    0x87: b"\xe2\x80\xa1",  # ‡
    0x88: b"\xcb\x86",  # ˆ
    0x89: b"\xe2\x80\xb0",  # ‰
    0x8A: b"\xc5\xa0",  # Š
    0x8B: b"\xe2\x80\xb9",  # ‹
    0x8C: b"\xc5\x92",  # Œ
    0x8E: b"\xc5\xbd",  # Ž
    0x91: b"\xe2\x80\x98",  # ‘
    0x92: b"\xe2\x80\x99",  # ’
    0x93: b"\xe2\x80\x9c",  # “
    0x94: b"\xe2\x80\x9d",  # ”
    0x95: b"\xe2\x80\xa2",  # •
    0x96: b"\xe2\x80\x93",  # –
    0x97: b"\xe2\x80\x94",  # —
    0x98: b"\xcb\x9c",  # ˜
    0x99: b"\xe2\x84\xa2",  # ™
    0x9A: b"\xc5\xa1",  # š
    0x9B: b"\xe2\x80\xba",  # ›
    0x9C: b"\xc5\x93",  # œ
    0x9E: b"\xc5\xbe",  # ž
    0x9F: b"\xc5\xb8",  # Ÿ
    0xA0: b"\xc2\xa0",  # (no-break space)
    0xA1: b"\xc2\xa1",  # ¡
    0xA2: b"\xc2\xa2",  # ¢
    0xA3: b"\xc2\xa3",  # £
    0xA4: b"\xc2\xa4",  # ¤
    0xA5: b"\xc2\xa5",  # ¥
    0xA6: b"\xc2\xa6",  # ¦
    0xA7: b"\xc2\xa7",  # §
    0xA8: b"\xc2\xa8",  # ¨
    0xA9: b"\xc2\xa9",  # ©
    0xAA: b"\xc2\xaa",  # ª
    0xAB: b"\xc2\xab",  # «
    0xAC: b"\xc2\xac",  # ¬
    0xAD: b"\xc2\xad",  # (soft hyphen)
    0xAE: b"\xc2\xae",  # ®
    0xAF: b"\xc2\xaf",  # ¯
    0xB0: b"\xc2\xb0",  # °
    0xB1: b"\xc2\xb1",  # ±
    0xB2: b"\xc2\xb2",  # ²
    0xB3: b"\xc2\xb3",  # ³
    0xB4: b"\xc2\xb4",  # ´
    0xB5: b"\xc2\xb5",  # µ
    0xB6: b"\xc2\xb6",  # ¶
    0xB7: b"\xc2\xb7",  # ·
    0xB8: b"\xc2\xb8",  # ¸
    0xB9: b"\xc2\xb9",  # ¹
    0xBA: b"\xc2\xba",  # º
    0xBB: b"\xc2\xbb",  # »
    0xBC: b"\xc2\xbc",  # ¼
    0xBD: b"\xc2\xbd",  # ½
    0xBE: b"\xc2\xbe",  # ¾
    0xBF: b"\xc2\xbf",  # ¿
    0xC0: b"\xc3\x80",  # À
    0xC1: b"\xc3\x81",  # Á
    0xC2: b"\xc3\x82",  # Â
    0xC3: b"\xc3\x83",  # Ã
    0xC4: b"\xc3\x84",  # Ä
    0xC5: b"\xc3\x85",  # Å
    0xC6: b"\xc3\x86",  # Æ
    0xC7: b"\xc3\x87",  # Ç
    0xC8: b"\xc3\x88",  # È
    0xC9: b"\xc3\x89",  # É
    0xCA: b"\xc3\x8a",  # Ê
    0xCB: b"\xc3\x8b",  # Ë
    0xCC: b"\xc3\x8c",  # Ì
    0xCD: b"\xc3\x8d",  # Í
    0xCE: b"\xc3\x8e",  # Î
    0xCF: b"\xc3\x8f",  # Ï
    0xD0: b"\xc3\x90",  # Ð
    0xD1: b"\xc3\x91",  # Ñ
    0xD2: b"\xc3\x92",  # Ò
    0xD3: b"\xc3\x93",  # Ó
    0xD4: b"\xc3\x94",  # Ô
    0xD5: b"\xc3\x95",  # Õ
    0xD6: b"\xc3\x96",  # Ö
    0xD7: b"\xc3\x97",  # ×
    0xD8: b"\xc3\x98",  # Ø
    0xD9: b"\xc3\x99",  # Ù
    0xDA: b"\xc3\x9a",  # Ú
    0xDB: b"\xc3\x9b",  # Û
    0xDC: b"\xc3\x9c",  # Ü
    0xDD: b"\xc3\x9d",  # Ý
    0xDE: b"\xc3\x9e",  # Þ
    0xDF: b"\xc3\x9f",  # ß
    0xE0: b"\xc3\xa0",  # à
    # U+00E1 is the two-byte sequence C3 A1; a lone A1 byte is not
    # valid UTF-8.
    0xE1: b"\xc3\xa1",  # á
    0xE2: b"\xc3\xa2",  # â
    0xE3: b"\xc3\xa3",  # ã
    0xE4: b"\xc3\xa4",  # ä
    0xE5: b"\xc3\xa5",  # å
    0xE6: b"\xc3\xa6",  # æ
    0xE7: b"\xc3\xa7",  # ç
    0xE8: b"\xc3\xa8",  # è
    0xE9: b"\xc3\xa9",  # é
    0xEA: b"\xc3\xaa",  # ê
    0xEB: b"\xc3\xab",  # ë
    0xEC: b"\xc3\xac",  # ì
    0xED: b"\xc3\xad",  # í
    0xEE: b"\xc3\xae",  # î
    0xEF: b"\xc3\xaf",  # ï
    0xF0: b"\xc3\xb0",  # ð
    0xF1: b"\xc3\xb1",  # ñ
    0xF2: b"\xc3\xb2",  # ò
    0xF3: b"\xc3\xb3",  # ó
    0xF4: b"\xc3\xb4",  # ô
    0xF5: b"\xc3\xb5",  # õ
    0xF6: b"\xc3\xb6",  # ö
    0xF7: b"\xc3\xb7",  # ÷
    0xF8: b"\xc3\xb8",  # ø
    0xF9: b"\xc3\xb9",  # ù
    0xFA: b"\xc3\xba",  # ú
    0xFB: b"\xc3\xbb",  # û
    0xFC: b"\xc3\xbc",  # ü
    0xFD: b"\xc3\xbd",  # ý
    0xFE: b"\xc3\xbe",  # þ
    0xFF: b"\xc3\xbf",  # ÿ
}

1337 

#: :meta private:
# The last two code points (xFFFE and xFFFF) of each of the seventeen
# Unicode planes, U+FFFE through U+10FFFF. This isn't every Unicode
# noncharacter, just the noncontiguous ones that need to be listed; the
# contiguous range U+FDD0 to U+FDEF is checked separately.
#
# "A noncharacter is a code point that is in the range
# U+FDD0 to U+FDEF, inclusive, or U+FFFE, U+FFFF, U+1FFFE,
# U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE,
# U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
# U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE,
# U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
# U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE,
# or U+10FFFF."
ENUMERATED_NONCHARACTERS: Set[int] = {
    (plane << 16) | low_bits
    for plane in range(0x11)
    for low_bits in (0xFFFE, 0xFFFF)
}

1366 

#: (first lead byte, last lead byte, sequence length in bytes) for each
#: length of UTF-8 multibyte character. `detwingle` uses this table to
#: skip over a whole multibyte character once it sees the lead byte.
#:
#: :meta private:
MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
    (0xC2, 0xDF, 2),  # 2-byte characters start with a byte C2-DF
    (0xE0, 0xEF, 3),  # 3-byte characters start with E0-EF
    (0xF0, 0xF4, 4),  # 4-byte characters start with F0-F4
]

#: The lowest byte value that can start a multibyte character: the
#: lower bound of the first row of `MULTIBYTE_MARKERS_AND_SIZES`.
#:
#: :meta private:
FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

#: The highest byte value that can start a multibyte character: the
#: upper bound of the last row of `MULTIBYTE_MARKERS_AND_SIZES`.
#:
#: :meta private:
LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

1379 

@classmethod
def numeric_character_reference(cls, numeric:int) -> Tuple[str, bool]:
    """Resolve the number from a numeric character reference to a character.

    This (mostly) implements the algorithm described in "Numeric character
    reference end state" from the HTML spec:
    https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state

    The algorithm is designed to convert numeric character references like "&#9731;"
    to Unicode characters like "☃".

    :param numeric: The code point named by the reference.
    :return: A 2-tuple (character, replaced). `character` is the Unicode
       character corresponding to the numeric reference and `replaced` is
       whether or not an unresolvable character was replaced with REPLACEMENT
       CHARACTER.
    """
    replacement = "\ufffd"

    # Three parse errors all "set the character reference code to 0xFFFD":
    #  * null-character-reference: "the number is 0x00"
    #  * character-reference-outside-unicode-range: "the number is
    #    greater than 0x10FFFF"
    #  * surrogate-character-reference: "the number is a surrogate"
    unresolvable = (
        numeric == 0x00
        or numeric > 0x10ffff
        or 0xd800 <= numeric <= 0xdfff
    )
    if unresolvable:
        return replacement, True

    # "If the number is a noncharacter, then this is a
    # noncharacter-character-reference parse error."
    #
    # "The parser resolves such character references as-is."
    #
    # It's not obvious what "as-is" means; it's taken here to mean that
    # we act like there was no error condition.
    if 0xfdd0 <= numeric <= 0xfdef or numeric in cls.ENUMERATED_NONCHARACTERS:
        return chr(numeric), False

    # "If the number is 0x0D, or a control that's not ASCII whitespace,
    # then this is a control-character-reference parse error."
    #
    # "A control is a C0 control or a code point in the range
    # U+007F DELETE to U+009F APPLICATION PROGRAM COMMAND,
    # inclusive."
    #
    # "A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive."
    #
    # "The parser resolves such character references as-is except C1 control references that are replaced."
    #
    # Handle the replaceable C1 references first: "If the number is one
    # of the numbers in the first column of the following table, then
    # find the row with that number in the first column, and set the
    # character reference code to the number in the second column of
    # that row."
    #
    # This catches characters that were encoded to numeric entities
    # using their Windows-1252 encodings rather than their UTF-8
    # encodings.
    if 0x80 <= numeric <= 0x9f and numeric in cls.WINDOWS_1252_TO_UTF8:
        return cls.WINDOWS_1252_TO_UTF8[numeric].decode("utf8"), False

    # Everything else is resolved as-is, which is also the ordinary path
    # for non-weird character references.
    try:
        return chr(numeric), False
    except (ValueError, OverflowError):
        # This shouldn't happen, since these cases should have been
        # handled above, but if it does, return REPLACEMENT CHARACTER.
        return replacement, True

1455 

@classmethod
def detwingle(
    cls,
    in_bytes: bytes,
    main_encoding: _Encoding = "utf8",
    embedded_encoding: _Encoding = "windows-1252",
) -> bytes:
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 characters
    embedded in UTF-8.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this *must*
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of ``in_bytes``.
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document.
    :return: A bytestring similar to ``in_bytes``, in which
      ``embedded_encoding`` characters have been converted to
      their ``main_encoding`` equivalents. When nothing needed
      converting, ``in_bytes`` itself is returned.
    :raise NotImplementedError: If ``embedded_encoding`` is not a
        spelling of "windows-1252", or ``main_encoding`` is not
        "utf8" or "utf-8" (both compared case-insensitively).
    """
    normalized_embedded = embedded_encoding.replace("_", "-").lower()
    if normalized_embedded not in ("windows-1252", "windows_1252"):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings."
        )

    if main_encoding.lower() not in ("utf8", "utf-8"):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding."
        )

    pieces = []
    copied_upto = 0  # Everything before this offset is already in `pieces`.
    index = 0
    total = len(in_bytes)
    while index < total:
        current = in_bytes[index]

        if cls.FIRST_MULTIBYTE_MARKER <= current <= cls.LAST_MULTIBYTE_MARKER:
            # This is the start of a UTF-8 multibyte character. Jump
            # past the whole character.
            for low, high, width in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if low <= current <= high:
                    index += width
                    break
            continue

        if current >= 0x80 and current in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character! Flush the untouched
            # bytes that precede it, then emit its UTF-8 translation.
            pieces.append(in_bytes[copied_upto:index])
            pieces.append(cls.WINDOWS_1252_TO_UTF8[current])
            index += 1
            copied_upto = index
            continue

        # An ordinary byte; go on to the next one.
        index += 1

    if copied_upto == 0:
        # Nothing was translated, so the input is returned unchanged.
        return in_bytes

    # Store whatever follows the last translated character.
    pieces.append(in_bytes[copied_upto:])
    return b"".join(pieces)