Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/dammit.py: 42%

1# -*- coding: utf-8 -*-

2"""Beautiful Soup bonus library: Unicode, Dammit

4This library converts a bytestream to Unicode through any means

5necessary. It is heavily based on code from Mark Pilgrim's `Universal

6Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained

7by Kurt McKee. It does not rewrite the body of an XML or HTML document

8to reflect a new encoding; that's the job of `TreeBuilder`.

10"""

12# Use of this source code is governed by the MIT license.

13__license__ = "MIT"

15from html.entities import codepoint2name

16from collections import defaultdict

17import codecs

18from html.entities import html5

19import re

20from logging import Logger, getLogger

21from types import ModuleType

22from typing import (

23 Dict,

24 Iterator,

25 List,

26 Optional,

27 Pattern,

28 Set,

29 Tuple,

30 Type,

31 Union,

32 cast,

33)

34from typing_extensions import Literal

35from bs4._typing import (

36 _Encoding,

37 _Encodings,

38)

39import warnings

# Import a library to autodetect character encodings. We'll support
# any of a number of libraries that all support the same API:
#
# * cchardet
# * chardet
# * charset-normalizer
#
# The first one that can be imported wins; if none is installed,
# chardet_module stays None and autodetection is skipped.
chardet_module: Optional[ModuleType] = None
try:
    # PyPI package: cchardet
    import cchardet  # type:ignore

    chardet_module = cchardet
except ImportError:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet

        chardet_module = chardet
    except ImportError:
        try:
            # PyPI package: charset-normalizer
            import charset_normalizer  # type:ignore

            chardet_module = charset_normalizer
        except ImportError:
            # No chardet available.
            pass


def _chardet_dammit(s: bytes) -> Optional[str]:
    """Try as hard as possible to detect the encoding of a bytestring.

    :param s: The bytestring to analyze.
    :return: The value stored under the ``"encoding"`` key of the
        detection library's ``detect()`` result, or None if no
        detection library could be imported or if ``s`` is actually
        a (Unicode) string rather than bytes.
    """
    if chardet_module is None or isinstance(s, str):
        return None
    module = chardet_module
    return module.detect(s)["encoding"]

# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
#
# In both patterns, group 1 captures the declared encoding name.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# Maps the type of the markup being searched (bytes or str) to a
# dictionary with one compiled, case-insensitive pattern per document
# flavor ("html" and "xml").
#
# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
encoding_res[bytes] = {
    "html": re.compile(html_meta.encode("ascii"), re.I),
    "xml": re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    "html": re.compile(html_meta, re.I),
    "xml": re.compile(xml_encoding, re.I),
}

class EntitySubstitutionMeta(type):
    """Provides lazy access to some data structures and regular
    expressions used by EntitySubstitution which have a measurable
    startup cost.

    Each public property checks whether the tables have been built
    and, if not, builds all of them at once by calling
    `_populate_class_variables`.
    """

    # Flipped to True by _populate_class_variables() once the lookup
    # tables and regular expressions below have been built.
    _CLASS_VARIABLES_POPULATED: bool = False

    # Backing storage for the lazy properties; assigned by
    # _populate_class_variables().
    _HTML_ENTITY_TO_CHARACTER: Dict[str, str]
    _CHARACTER_TO_HTML_ENTITY: Dict[str, str]
    _CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]
    _CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @property
    def HTML_ENTITY_TO_CHARACTER(self) -> Dict[str, str]:
        """A mapping of entity names like "angmsdaa" to Unicode
        strings like "⦨".
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._HTML_ENTITY_TO_CHARACTER

    @property
    def CHARACTER_TO_HTML_ENTITY(self) -> Dict[str, str]:
        """A mapping of Unicode strings like "⦨" to entity names like
        "angmsdaa". When a single Unicode string has multiple entity
        names, we try to choose the most commonly-used name.
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._CHARACTER_TO_HTML_ENTITY

    @property
    def CHARACTER_TO_HTML_ENTITY_RE(self) -> Pattern[str]:
        """A regular expression matching (almost) any Unicode string
        that corresponds to an HTML5 named entity.
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._CHARACTER_TO_HTML_ENTITY_RE

    @property
    def CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE(self) -> Pattern[str]:
        """A very similar regular expression to
        CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
        ampersands. This is used by the 'html' formatter to provide
        backwards-compatibility, even though the HTML5 spec allows
        most ampersands to go unescaped.
        """
        if not self._CLASS_VARIABLES_POPULATED:
            self._populate_class_variables()
        return self._CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE

    def _populate_class_variables(self) -> None:
        """Initialize variables used by EntitySubstitution to manage the plethora of
        HTML and HTML5 named entities.

        This method populates the class variables necessary to make
        the properties defined in the metaclass work. Calling it again
        after the variables have been populated is a no-op.
        """
        if self._CLASS_VARIABLES_POPULATED:
            return
        unicode_to_name: Dict[str, str] = {}
        name_to_unicode: Dict[str, str] = {}

        short_entities: Set[str] = set()
        long_entities_by_first_character: Dict[str, Set[str]] = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity.  As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles: Set[str] = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character.get(short)
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join(sorted(x[1] for x in long_versions))
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            particles.update(long_entities)

        # The alternatives are mutually exclusive at any given
        # position (thanks to the negative lookaheads), so their order
        # doesn't affect matching; sorting just makes the compiled
        # pattern the same from one run to the next.
        re_definition = "(%s)" % "|".join(sorted(particles))

        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(sorted(particles))

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        self._CHARACTER_TO_HTML_ENTITY = unicode_to_name
        self._HTML_ENTITY_TO_CHARACTER = name_to_unicode
        self._CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        self._CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )
        self._CLASS_VARIABLES_POPULATED = True


class EntitySubstitution(metaclass=EntitySubstitutionMeta):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity. Group 1 is the text
    # between the ampersand and the semicolon.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match whose full text is the character
            string to be replaced.
        :return: A named entity reference for the matched string. If
            no entity name is known for it, the string is returned
            wrapped in an escaped ampersand and a semicolon.
        """
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            return "&amp;%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string.

        :param matchobj: A match whose full text is a key of
            `CHARACTER_TO_XML_ENTITY`.
        :raises KeyError: If the matched text has no XML entity.
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of anything that looks like an entity,
        so that "&foo;" becomes "&amp;foo;".

        :param matchobj: A match produced by `ANY_ENTITY_RE`.
        """
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of something that looks like an entity
        only if its name is not a recognized HTML entity name.

        :param matchobj: A match produced by `ANY_ENTITY_RE`.
        :return: The original entity reference if its name is in
            `HTML_ENTITY_TO_CHARACTER`; otherwise the same text with
            the leading ampersand replaced by "&amp;".
        """
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            return "&%s;" % possible_entity
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
         with named entities.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with the substitutions applied.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # Convert any appropriate characters to HTML entities.
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # First, escape any HTML entities found in the markup.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Next, convert any appropriate characters to unescaped HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # First, escape the ampersand for anything that looks like an
        # entity but isn't in the list of recognized entities. All other
        # ampersands can be left alone.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then, convert a range of Unicode characters to unescaped
        # HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

492

493

class EncodingDetector:
    """This class is capable of guessing a number of possible encodings
    for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
       (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
       byte-order mark sniffing fails (the ``user_encodings`` argument to the
       constructor).

    4. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.

    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        # Copy the caller's list so that appending the deprecated
        # override_encodings below never mutates the caller's object.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        # Stored lowercased because _usable() lowercases each candidate
        # before checking it against this set.
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = False if is_html is None else is_html
        self.declared_encoding: Optional[str] = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This
            will be modified as a side effect.
        :return: True if ``encoding`` is not None, is not excluded, and
            has not been tried before (compared case-insensitively).
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings:
            return False
        if encoding not in tried:
            tried.add(encoding)
            return True
        return False

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        Each access starts a fresh generator with its own record of
        which encodings have been yielded. The declared encoding and
        the chardet guess are cached on the instance the first time
        they come back as something other than None.

        :yield: A sequence of strings. Each is the name of an encoding
           that *might* work to convert a bytestring into Unicode.
        """
        tried: Set[_Encoding] = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self.sniffed_encoding is not None and self._usable(
            self.sniffed_encoding, tried
        ):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        if self.declared_encoding is not None and self._usable(
            self.declared_encoding, tried
        ):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        if self.chardet_encoding is not None and self._usable(
            self.chardet_encoding, tried
        ):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ("utf-8", "windows-1252"):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
           byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks require that the two bytes after the mark
        # are not both zero: b"\xff\xfe\x00\x00" is the UTF-32LE mark,
        # which is handled further down.
        if (
            (len(data) >= 4)
            and (data[:2] == b"\xfe\xff")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16be"
            data = data[2:]
        elif (
            (len(data) >= 4)
            and (data[:2] == b"\xff\xfe")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16le"
            data = data[2:]
        elif data[:3] == b"\xef\xbb\xbf":
            encoding = "utf-8"
            data = data[3:]
        elif data[:4] == b"\x00\x00\xfe\xff":
            encoding = "utf-32be"
            data = data[4:]
        elif data[:4] == b"\xff\xfe\x00\x00":
            encoding = "utf-32le"
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Given a document, tries to find an encoding declared within the
        text of the document itself.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to be declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data.  Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, lowercased, if one is found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Only look at the first kilobyte for an XML declaration,
            # and at the first 2048 characters or 5% of the document
            # (whichever is larger) for an HTML <meta> tag.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Use the set of patterns that matches the type of the markup.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res["xml"]
        html_re = res["html"]
        declared_encoding: Optional[_Encoding] = None
        # The XML declaration is checked even for HTML documents; the
        # <meta> tag is only consulted if that finds nothing.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode("ascii", "replace")
            return declared_encoding.lower()
        return None

741

742

743class UnicodeDammit:

744 """A class for detecting the encoding of a bytestring containing an

745 HTML or XML document, and decoding it to Unicode. If the source

746 encoding is windows-1252, `UnicodeDammit` can also replace

747 Microsoft smart quotes with their HTML or XML equivalents.

748

749 :param markup: HTML or XML markup in an unknown encoding.

750

751 :param known_definite_encodings: When determining the encoding

752 of ``markup``, these encodings will be tried first, in

753 order. In HTML terms, this corresponds to the "known

754 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

755

756 :param user_encodings: These encodings will be tried after the

757 ``known_definite_encodings`` have been tried and failed, and

758 after an attempt to sniff the encoding by looking at a

759 byte order mark has failed. In HTML terms, this

760 corresponds to the step "user has explicitly instructed

761 the user agent to override the document's character

762 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

763

764 :param override_encodings: A **deprecated** alias for

765 ``known_definite_encodings``. Any encodings here will be tried

766 immediately after the encodings in

767 ``known_definite_encodings``.

768

769 :param smart_quotes_to: By default, Microsoft smart quotes will,

770 like all other characters, be converted to Unicode

771 characters. Setting this to ``ascii`` will convert them to ASCII

772 quotes instead. Setting it to ``xml`` will convert them to XML

773 entity references, and setting it to ``html`` will convert them

774 to HTML entity references.

775

776 :param is_html: If True, ``markup`` is treated as an HTML

777 document. Otherwise it's treated as an XML document.

778

779 :param exclude_encodings: These encodings will not be considered,

780 even if the sniffing code thinks they might make sense.

781

782 """

783

def __init__(
    self,
    markup: bytes,
    known_definite_encodings: Optional[_Encodings] = None,
    smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
    is_html: bool = False,
    exclude_encodings: Optional[_Encodings] = None,
    user_encodings: Optional[_Encodings] = None,
    override_encodings: Optional[_Encodings] = None,
):
    """Detect the encoding of ``markup`` and convert it to Unicode.

    The candidate encodings proposed by an `EncodingDetector` are
    tried in order with strict error handling. If none of them
    works, they are tried again with undecodable bytes replaced by
    REPLACEMENT CHARACTER. The result is stored in
    ``unicode_markup``; it is None if every attempt failed.

    :param markup: HTML or XML markup in an unknown encoding. If
        this is already a string, it is used as-is.
    :param known_definite_encodings: Encodings to try first. None
        means no such encodings.
    :param smart_quotes_to: How to treat Microsoft smart quotes:
        ``ascii``, ``xml``, ``html``, or None.
    :param is_html: If True, ``markup`` is treated as an HTML
        document. Otherwise it's treated as an XML document.
    :param exclude_encodings: Encodings that must not be
        considered. None means no encodings are excluded.
    :param user_encodings: Encodings to try after byte-order mark
        sniffing has failed.
    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``.
    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = getLogger(__name__)
    # EncodingDetector treats None the same as an empty list for all
    # of its encoding arguments, so None defaults are passed through.
    self.detector = EncodingDetector(
        markup,
        known_definite_encodings,
        is_html,
        exclude_encodings,
        user_encodings,
        override_encodings,
    )

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, str):
        self.markup = markup.encode("utf8")
        self.unicode_markup = markup
        self.original_encoding = None
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # Compare against None rather than testing truthiness: an empty
    # document decodes successfully to "", and that must not send
    # us down the character-replacement path.
    if u is None:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.

        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER."
                )

                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    #
    # Note that this is extremely unlikely, probably impossible,
    # because the "replace" strategy is so powerful. Even running
    # the Python binary through Unicode, Dammit gives you Unicode,
    # albeit Unicode riddled with REPLACEMENT CHARACTER.
    if u is None:
        self.original_encoding = None
        self.unicode_markup = None
    else:
        self.unicode_markup = u

855

#: The markup as bytes, as it stood just before conversion to
#: Unicode. This may differ from what was passed in to the
#: constructor, because any byte-order mark is stripped first.
markup: bytes

#: The markup after conversion to Unicode, or None if there was
#: simply no way to turn the bytestring into Unicode (as happens
#: with binary data).
unicode_markup: Optional[str]

#: True if `UnicodeDammit.unicode_markup` contains U+FFFD
#: REPLACEMENT CHARACTER characters that were not present in
#: `UnicodeDammit.markup`. Each one marks a character sequence that
#: could not be represented in Unicode.
contains_replacement_characters: bool

#: Unicode, Dammit's best guess at the character encoding that
#: `UnicodeDammit.markup` was originally written in.
original_encoding: Optional[_Encoding]

#: The strategy used to handle Microsoft smart quotes.
smart_quotes_to: Optional[str]

#: Every (encoding, error handling strategy) 2-tuple that was tried
#: while attempting to convert the markup to Unicode.
tried_encodings: List[Tuple[_Encoding, str]]

log: Logger  #: :meta private:

884

def _sub_ms_char(self, match: re.Match) -> bytes:
    """Translate one Microsoft smart-quote byte into its replacement.

    Depending on `smart_quotes_to`, the replacement is an ASCII
    approximation ("ascii"), an XML numeric character reference
    ("xml"), or an HTML named entity (any other value).

    TODO: Since this is only used to convert smart quotes, it
    could be simplified, and MS_CHARS_TO_ASCII made much less
    parochial.

    :param match: A regular expression match whose first group is
        the single byte to replace.
    :return: The replacement bytes, or the matched byte itself when
        no replacement is known for it.
    """
    original: bytes = match.group(1)

    if self.smart_quotes_to == "ascii":
        ascii_equivalent = self.MS_CHARS_TO_ASCII.get(original)
        if ascii_equivalent is None:
            # Shouldn't happen; substitute the character with itself.
            return original
        return ascii_equivalent.encode()

    if original not in self.MS_CHARS:
        # Shouldn't happen; substitute the character for itself.
        return original

    entity = self.MS_CHARS[original]
    if type(entity) is not tuple:
        # A plain string is used verbatim in both XML and HTML mode.
        return cast(str, entity).encode()

    # A tuple is (HTML entity name, XML hexadecimal code point).
    if self.smart_quotes_to == "xml":
        return b"&#x" + entity[1].encode() + b";"
    return b"&" + entity[0].encode() + b";"

918

#: Commonly seen "charset" values from HTML meta tags, mapped to the
#: Python codec names they stand for. Only values that are missing
#: from Python's own aliases, and that the heuristics in `find_codec`
#: can't work out, need to be listed here.
#:
#: :meta hide-value:
CHARSET_ALIASES: Dict[str, _Encoding] = {"macintosh": "mac-roman", "x-sjis": "shift-jis"}

929

#: The encodings in which Microsoft smart quotes tend to show up.
#:
#: :meta hide-value:
ENCODINGS_WITH_SMART_QUOTES: _Encodings = ["windows-1252", "iso-8859-1", "iso-8859-2"]

938

def _convert_from(
    self, proposed: _Encoding, errors: str = "strict"
) -> Optional[str]:
    """Try to decode the markup using one proposed encoding.

    Every attempt is recorded in `tried_encodings`, and the same
    (codec, errors) combination is never attempted twice. On success,
    `unicode_markup` and `original_encoding` are updated to reflect
    the codec that worked.

    :param proposed: The name of a character encoding.
    :param errors: An error handling strategy, used when calling `str`.
    :return: The converted markup, or `None` if the proposed
        encoding/error handling strategy didn't work.
    """
    codec_name = self.find_codec(proposed)
    attempt = (codec_name, errors)
    if codec_name is None or attempt in self.tried_encodings:
        return None
    self.tried_encodings.append(attempt)

    data = self.markup
    may_contain_smart_quotes = (
        self.smart_quotes_to is not None
        and codec_name in self.ENCODINGS_WITH_SMART_QUOTES
    )
    if may_contain_smart_quotes:
        # Rewrite each byte in the smart-quote range before decoding,
        # using whichever strategy smart_quotes_to selects.
        data = re.sub(b"([\x80-\x9f])", self._sub_ms_char, data)

    try:
        decoded = self._to_unicode(data, codec_name, errors)
        self.unicode_markup = decoded
        self.original_encoding = codec_name
    except Exception:
        # Deliberately broad: any failure just means this encoding
        # is not the right one, and the caller moves on to the next.
        return None
    return decoded

977

def _to_unicode(
    self, data: bytes, encoding: _Encoding, errors: str = "strict"
) -> str:
    """Decode a bytestring into Unicode using the given encoding.

    :param data: The bytestring to decode.
    :param encoding: The name of an encoding.
    :param errors: An error handling strategy, used when calling `str`.
    :return: The decoded string. Any exception raised by `str` is
        passed on to the caller.
    """
    return str(data, encoding=encoding, errors=errors)

987

@property
def declared_html_encoding(self) -> Optional[_Encoding]:
    """The encoding declared *inside* the document, if the markup is
    an HTML document and declares one; otherwise None.
    """
    return self.detector.declared_encoding if self.is_html else None

996

def find_codec(self, charset: _Encoding) -> Optional[str]:
    """Look up the Python codec corresponding to a given character set.

    The name is tried, in order: after translation through
    `CHARSET_ALIASES`, with hyphens removed, and with hyphens turned
    into underscores. If none of those names a known codec, the
    character set name itself is used.

    :param charset: The name of a character set.
    :return: The lowercased name of a Python codec, or None if
        ``charset`` is empty.
    """
    resolved = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not resolved and charset:
        for variant in (charset.replace("-", ""), charset.replace("-", "_")):
            resolved = self._codec(variant)
            if resolved:
                break
        else:
            # No variant names a known codec; fall back to the name
            # we were given.
            resolved = charset.lower()
    return resolved.lower() if resolved else None

1013

def _codec(self, charset: _Encoding) -> Optional[str]:
    """Check whether Python has a codec registered under a name.

    :param charset: The name of a character set.
    :return: ``charset`` itself if `codecs.lookup` accepts it, None
        if it doesn't. An empty or None ``charset`` is handed back
        unchanged without any lookup.
    """
    if not charset:
        return charset
    try:
        codecs.lookup(charset)
    except (LookupError, ValueError):
        return None
    return charset

1024

#: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
#:
#: Each key is a single byte in the range 0x80-0x9f. Each value is
#: either a 2-tuple (HTML entity name, hexadecimal Unicode code point
#: for an XML numeric character reference) or a plain string that is
#: substituted as-is.
#:
#: :meta hide-value:
MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
    b"\x80": ("euro", "20AC"),
    b"\x81": " ",
    b"\x82": ("sbquo", "201A"),
    b"\x83": ("fnof", "192"),
    b"\x84": ("bdquo", "201E"),
    b"\x85": ("hellip", "2026"),
    b"\x86": ("dagger", "2020"),
    b"\x87": ("Dagger", "2021"),
    b"\x88": ("circ", "2C6"),
    b"\x89": ("permil", "2030"),
    b"\x8a": ("Scaron", "160"),
    b"\x8b": ("lsaquo", "2039"),
    b"\x8c": ("OElig", "152"),
    b"\x8d": "?",
    b"\x8e": ("#x17D", "17D"),
    b"\x8f": "?",
    b"\x90": "?",
    b"\x91": ("lsquo", "2018"),
    b"\x92": ("rsquo", "2019"),
    b"\x93": ("ldquo", "201C"),
    b"\x94": ("rdquo", "201D"),
    b"\x95": ("bull", "2022"),
    b"\x96": ("ndash", "2013"),
    b"\x97": ("mdash", "2014"),
    b"\x98": ("tilde", "2DC"),
    b"\x99": ("trade", "2122"),
    b"\x9a": ("scaron", "161"),
    b"\x9b": ("rsaquo", "203A"),
    b"\x9c": ("oelig", "153"),
    b"\x9d": "?",
    b"\x9e": ("#x17E", "17E"),
    # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty code
    # point here would produce the invalid reference "&#x;".
    b"\x9f": ("Yuml", "178"),
}

1062

#: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
#: horrors like stripping diacritical marks to turn á into a, but also
#: contains non-horrors like turning “ into ".
#:
#: Seriously, don't use this for anything other than removing smart
#: quotes.
#:
#: The keys are the single bytes 0x80 through 0xff, in order; each row
#: below holds the replacements for eight consecutive bytes.
#:
#: :meta private:
MS_CHARS_TO_ASCII: Dict[bytes, str] = dict(
    zip(
        (bytes((code,)) for code in range(0x80, 0x100)),
        (
            # 0x80-0x87
            "EUR", " ", ",", "f", ",,", "...", "+", "++",
            # 0x88-0x8f
            "^", "%", "S", "<", "OE", "?", "Z", "?",
            # 0x90-0x97
            "?", "'", "'", '"', '"', "*", "-", "--",
            # 0x98-0x9f
            "~", "(TM)", "s", ">", "oe", "?", "z", "Y",
            # 0xa0-0xa7. The "$" for 0xa4 is especially parochial--this
            # is the generic currency symbol.
            " ", "!", "c", "GBP", "$", "YEN", "|", "S",
            # 0xa8-0xaf
            # NOTE(review): 0xa9 maps to an empty string, unlike the
            # "(R)" and "(TM)" entries -- confirm this is intended.
            "..", "", "(th)", "<<", "!", " ", "(R)", "-",
            # 0xb0-0xb7
            "o", "+-", "2", "3", "'", "u", "P", "*",
            # 0xb8-0xbf
            ",", "1", "(th)", ">>", "1/4", "1/2", "3/4", "?",
            # 0xc0-0xc7
            "A", "A", "A", "A", "A", "A", "AE", "C",
            # 0xc8-0xcf
            "E", "E", "E", "E", "I", "I", "I", "I",
            # 0xd0-0xd7
            "D", "N", "O", "O", "O", "O", "O", "*",
            # 0xd8-0xdf
            "O", "U", "U", "U", "U", "Y", "b", "B",
            # 0xe0-0xe7
            "a", "a", "a", "a", "a", "a", "ae", "c",
            # 0xe8-0xef
            "e", "e", "e", "e", "i", "i", "i", "i",
            # 0xf0-0xf7
            "o", "n", "o", "o", "o", "o", "o", "/",
            # 0xf8-0xff
            "o", "u", "u", "u", "u", "y", "b", "y",
        ),
    )
)

1202

#: A map used when removing rogue Windows-1252/ISO-8859-1
#: characters in otherwise UTF-8 documents. Also used when a
#: numeric character entity has been incorrectly encoded using the
#: character's Windows-1252 encoding.
#:
#: Each key is a Windows-1252 byte value; each value is the UTF-8
#: encoding of the character that byte stands for.
#:
#: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
#: Windows-1252, so they have no entry here.
#:
#: :meta hide-value:
WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
    0x80: b"\xe2\x82\xac",  # €
    0x82: b"\xe2\x80\x9a",  # ‚
    0x83: b"\xc6\x92",  # ƒ
    0x84: b"\xe2\x80\x9e",  # „
    0x85: b"\xe2\x80\xa6",  # …
    0x86: b"\xe2\x80\xa0",  # †
    0x87: b"\xe2\x80\xa1",  # ‡
    0x88: b"\xcb\x86",  # ˆ
    0x89: b"\xe2\x80\xb0",  # ‰
    0x8A: b"\xc5\xa0",  # Š
    0x8B: b"\xe2\x80\xb9",  # ‹
    0x8C: b"\xc5\x92",  # Œ
    0x8E: b"\xc5\xbd",  # Ž
    0x91: b"\xe2\x80\x98",  # ‘
    0x92: b"\xe2\x80\x99",  # ’
    0x93: b"\xe2\x80\x9c",  # “
    0x94: b"\xe2\x80\x9d",  # ”
    0x95: b"\xe2\x80\xa2",  # •
    0x96: b"\xe2\x80\x93",  # –
    0x97: b"\xe2\x80\x94",  # —
    0x98: b"\xcb\x9c",  # ˜
    0x99: b"\xe2\x84\xa2",  # ™
    0x9A: b"\xc5\xa1",  # š
    0x9B: b"\xe2\x80\xba",  # ›
    0x9C: b"\xc5\x93",  # œ
    0x9E: b"\xc5\xbe",  # ž
    0x9F: b"\xc5\xb8",  # Ÿ
    0xA0: b"\xc2\xa0",  # (no-break space)
    0xA1: b"\xc2\xa1",  # ¡
    0xA2: b"\xc2\xa2",  # ¢
    0xA3: b"\xc2\xa3",  # £
    0xA4: b"\xc2\xa4",  # ¤
    0xA5: b"\xc2\xa5",  # ¥
    0xA6: b"\xc2\xa6",  # ¦
    0xA7: b"\xc2\xa7",  # §
    0xA8: b"\xc2\xa8",  # ¨
    0xA9: b"\xc2\xa9",  # ©
    0xAA: b"\xc2\xaa",  # ª
    0xAB: b"\xc2\xab",  # «
    0xAC: b"\xc2\xac",  # ¬
    0xAD: b"\xc2\xad",  # (soft hyphen)
    0xAE: b"\xc2\xae",  # ®
    0xAF: b"\xc2\xaf",  # ¯
    0xB0: b"\xc2\xb0",  # °
    0xB1: b"\xc2\xb1",  # ±
    0xB2: b"\xc2\xb2",  # ²
    0xB3: b"\xc2\xb3",  # ³
    0xB4: b"\xc2\xb4",  # ´
    0xB5: b"\xc2\xb5",  # µ
    0xB6: b"\xc2\xb6",  # ¶
    0xB7: b"\xc2\xb7",  # ·
    0xB8: b"\xc2\xb8",  # ¸
    0xB9: b"\xc2\xb9",  # ¹
    0xBA: b"\xc2\xba",  # º
    0xBB: b"\xc2\xbb",  # »
    0xBC: b"\xc2\xbc",  # ¼
    0xBD: b"\xc2\xbd",  # ½
    0xBE: b"\xc2\xbe",  # ¾
    0xBF: b"\xc2\xbf",  # ¿
    0xC0: b"\xc3\x80",  # À
    0xC1: b"\xc3\x81",  # Á
    0xC2: b"\xc3\x82",  # Â
    0xC3: b"\xc3\x83",  # Ã
    0xC4: b"\xc3\x84",  # Ä
    0xC5: b"\xc3\x85",  # Å
    0xC6: b"\xc3\x86",  # Æ
    0xC7: b"\xc3\x87",  # Ç
    0xC8: b"\xc3\x88",  # È
    0xC9: b"\xc3\x89",  # É
    0xCA: b"\xc3\x8a",  # Ê
    0xCB: b"\xc3\x8b",  # Ë
    0xCC: b"\xc3\x8c",  # Ì
    0xCD: b"\xc3\x8d",  # Í
    0xCE: b"\xc3\x8e",  # Î
    0xCF: b"\xc3\x8f",  # Ï
    0xD0: b"\xc3\x90",  # Ð
    0xD1: b"\xc3\x91",  # Ñ
    0xD2: b"\xc3\x92",  # Ò
    0xD3: b"\xc3\x93",  # Ó
    0xD4: b"\xc3\x94",  # Ô
    0xD5: b"\xc3\x95",  # Õ
    0xD6: b"\xc3\x96",  # Ö
    0xD7: b"\xc3\x97",  # ×
    0xD8: b"\xc3\x98",  # Ø
    0xD9: b"\xc3\x99",  # Ù
    0xDA: b"\xc3\x9a",  # Ú
    0xDB: b"\xc3\x9b",  # Û
    0xDC: b"\xc3\x9c",  # Ü
    0xDD: b"\xc3\x9d",  # Ý
    0xDE: b"\xc3\x9e",  # Þ
    0xDF: b"\xc3\x9f",  # ß
    0xE0: b"\xc3\xa0",  # à
    0xE1: b"\xc3\xa1",  # á
    0xE2: b"\xc3\xa2",  # â
    0xE3: b"\xc3\xa3",  # ã
    0xE4: b"\xc3\xa4",  # ä
    0xE5: b"\xc3\xa5",  # å
    0xE6: b"\xc3\xa6",  # æ
    0xE7: b"\xc3\xa7",  # ç
    0xE8: b"\xc3\xa8",  # è
    0xE9: b"\xc3\xa9",  # é
    0xEA: b"\xc3\xaa",  # ê
    0xEB: b"\xc3\xab",  # ë
    0xEC: b"\xc3\xac",  # ì
    0xED: b"\xc3\xad",  # í
    0xEE: b"\xc3\xae",  # î
    0xEF: b"\xc3\xaf",  # ï
    0xF0: b"\xc3\xb0",  # ð
    0xF1: b"\xc3\xb1",  # ñ
    0xF2: b"\xc3\xb2",  # ò
    0xF3: b"\xc3\xb3",  # ó
    0xF4: b"\xc3\xb4",  # ô
    0xF5: b"\xc3\xb5",  # õ
    0xF6: b"\xc3\xb6",  # ö
    0xF7: b"\xc3\xb7",  # ÷
    0xF8: b"\xc3\xb8",  # ø
    0xF9: b"\xc3\xb9",  # ù
    0xFA: b"\xc3\xba",  # ú
    0xFB: b"\xc3\xbb",  # û
    0xFC: b"\xc3\xbc",  # ü
    0xFD: b"\xc3\xbd",  # ý
    0xFE: b"\xc3\xbe",  # þ
    0xFF: b"\xc3\xbf",  # ÿ
}

1337

#: :meta private:
# Note that this isn't all Unicode noncharacters, just the noncontiguous
# ones that need to be listed: the code points ending in FFFE and FFFF
# in each of the seventeen planes 0x00 through 0x10.
#
# "A noncharacter is a code point that is in the range
# U+FDD0 to U+FDEF, inclusive, or U+FFFE, U+FFFF, U+1FFFE,
# U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE,
# U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
# U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE,
# U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
# U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE,
# or U+10FFFF."
ENUMERATED_NONCHARACTERS: Set[int] = {
    (plane << 16) | low_bits
    for plane in range(0x11)
    for low_bits in (0xFFFE, 0xFFFF)
}

1366

#: Inclusive ranges of bytes that start a UTF-8 multibyte character,
#: as (first lead byte, last lead byte, total size in bytes) 3-tuples.
#:
#: :meta private:
MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
    (0xC2, 0xDF, 2),  # C2-DF starts a 2-byte character
    (0xE0, 0xEF, 3),  # E0-EF starts a 3-byte character
    (0xF0, 0xF4, 4),  # F0-F4 starts a 4-byte character
]

#: The lower bound of the first range in MULTIBYTE_MARKERS_AND_SIZES.
#:
#: :meta private:
FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

#: The upper bound of the last range in MULTIBYTE_MARKERS_AND_SIZES.
#:
#: :meta private:
LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

1379

@classmethod
def numeric_character_reference(cls, numeric: int) -> Tuple[str, bool]:
    """Resolve the number from a numeric character reference to a character.

    This (mostly) implements the algorithm described in "Numeric character
    reference end state" from the HTML spec:
    https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state

    The algorithm is designed to convert numeric character references like
    "&#x2603;" to Unicode characters like "☃".

    :param numeric: The number found in the character reference.
    :return: A 2-tuple (character, replaced). `character` is the Unicode
       character corresponding to the numeric reference and `replaced` is
       whether or not an unresolvable character was replaced with REPLACEMENT
       CHARACTER.
    """
    replacement = "\ufffd"

    # Three parse errors all end with the character reference code
    # being set to 0xFFFD:
    #
    # "If the number is 0x00, then this is a
    # null-character-reference parse error."
    #
    # "If the number is greater than 0x10FFFF, then this is a
    # character-reference-outside-unicode-range parse error."
    #
    # "If the number is a surrogate, then this is a
    # surrogate-character-reference parse error."
    unresolvable = (
        numeric == 0x00
        or numeric > 0x10FFFF
        or 0xD800 <= numeric <= 0xDFFF
    )
    if unresolvable:
        return replacement, True

    # "If the number is a noncharacter, then this is a
    # noncharacter-character-reference parse error."
    #
    # "The parser resolves such character references as-is."
    #
    # I'm not sure what "as-is" means but I think it means that we act
    # like there was no error condition.
    if 0xFDD0 <= numeric <= 0xFDEF or numeric in cls.ENUMERATED_NONCHARACTERS:
        return chr(numeric), False

    # "If the number is 0x0D, or a control that's not ASCII whitespace,
    # then this is a control-character-reference parse error."
    #
    # "A control is a C0 control or a code point in the range
    # U+007F DELETE to U+009F APPLICATION PROGRAM COMMAND,
    # inclusive."
    #
    # "A C0 control is a code point in the range U+0000 NULL to
    # U+001F INFORMATION SEPARATOR ONE, inclusive."
    #
    # "The parser resolves such character references as-is except C1
    # control references that are replaced."
    #
    # So first, replace the control references that can be replaced:
    #
    # "If the number is one of the numbers in the first column of the
    # following table, then find the row with that number in the first
    # column, and set the character reference code to the number in the
    # second column of that row."
    #
    # This is an attempt to catch characters that were encoded to numeric
    # entities using their Windows-1252 encodings rather than their UTF-8
    # encodings.
    if 0x80 <= numeric <= 0x9F and numeric in cls.WINDOWS_1252_TO_UTF8:
        return cls.WINDOWS_1252_TO_UTF8[numeric].decode("utf8"), False

    # Everything left is resolved as-is. This is also the default
    # path for non-weird character references.
    try:
        return chr(numeric), False
    except (ValueError, OverflowError):
        # This shouldn't happen, since these cases should have been
        # handled above, but if it does, return REPLACEMENT CHARACTER.
        return replacement, True

1455

@classmethod
def detwingle(
    cls,
    in_bytes: bytes,
    main_encoding: _Encoding = "utf8",
    embedded_encoding: _Encoding = "windows-1252",
) -> bytes:
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    A byte between FIRST_MULTIBYTE_MARKER and LAST_MULTIBYTE_MARKER
    is always taken to be the first byte of a UTF-8 multibyte
    character, and is skipped together with the rest of that
    character; the bytes that follow it are not validated. Only
    bytes outside that range are candidates for conversion.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this *must*
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of ``in_bytes``.
        Must be "utf8" or "utf-8" (case-insensitive).
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document. Must be "windows-1252" or "iso-8859-1";
        the name is case-insensitive and may use underscores in place
        of hyphens.
    :return: A bytestring similar to ``in_bytes``, in which
        ``embedded_encoding`` characters have been converted to
        their ``main_encoding`` equivalents. If nothing needed
        converting, ``in_bytes`` itself is returned.
    :raise NotImplementedError: If either encoding is not one of the
        supported ones.
    """
    # Underscores are folded to hyphens so that each supported
    # encoding has exactly one normalized spelling to compare with.
    normalized_embedded = embedded_encoding.replace("_", "-").lower()
    if normalized_embedded not in ("windows-1252", "iso-8859-1"):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings."
        )

    if main_encoding.lower() not in ("utf8", "utf-8"):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding."
        )

    byte_chunks = []

    chunk_start = 0
    pos = 0
    total = len(in_bytes)
    while pos < total:
        byte = in_bytes[pos]
        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            # This is the start of a UTF-8 multibyte character. Skip
            # to the end.
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if start <= byte <= end:
                    pos += size
                    break
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1

    if chunk_start == 0:
        # No byte was converted (a conversion always moves
        # chunk_start past position 0), so the string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b"".join(byte_chunks)