Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/dammit.py: 42%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's `Universal
6Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
7by Kurt McKee. It does not rewrite the body of an XML or HTML document
8to reflect a new encoding; that's the job of `TreeBuilder`.
10"""
12# Use of this source code is governed by the MIT license.
13__license__ = "MIT"
15from html.entities import codepoint2name
16from collections import defaultdict
17import codecs
18from html.entities import html5
19import re
20from logging import Logger, getLogger
21from types import ModuleType
22from typing import (
23 Dict,
24 Iterator,
25 List,
26 Optional,
27 Pattern,
28 Set,
29 Tuple,
30 Type,
31 Union,
32 cast,
33)
34from typing_extensions import Literal
35from bs4._typing import (
36 _Encoding,
37 _Encodings,
38)
39import warnings
41# Import a library to autodetect character encodings. We'll support
42# any of a number of libraries that all support the same API:
43#
44# * cchardet
45# * chardet
46# * charset-normalizer
47chardet_module: Optional[ModuleType] = None
48try:
49 # PyPI package: cchardet
50 import cchardet # type:ignore
52 chardet_module = cchardet
53except ImportError:
54 try:
55 # Debian package: python-chardet
56 # PyPI package: chardet
57 import chardet
59 chardet_module = chardet
60 except ImportError:
61 try:
62 # PyPI package: charset-normalizer
63 import charset_normalizer # type:ignore
65 chardet_module = charset_normalizer
66 except ImportError:
67 # No chardet available.
68 pass
71def _chardet_dammit(s: bytes) -> Optional[str]:
72 """Try as hard as possible to detect the encoding of a bytestring."""
73 if chardet_module is None or isinstance(s, str):
74 return None
75 module = chardet_module
76 return module.detect(s)["encoding"]
79# Build bytestring and Unicode versions of regular expressions for finding
80# a declared encoding inside an XML or HTML document.
81xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>" #: :meta private:
82html_meta: str = (
83 "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]" #: :meta private:
84)
86# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
87encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
88encoding_res[bytes] = {
89 "html": re.compile(html_meta.encode("ascii"), re.I),
90 "xml": re.compile(xml_encoding.encode("ascii"), re.I),
91}
92encoding_res[str] = {
93 "html": re.compile(html_meta, re.I),
94 "xml": re.compile(xml_encoding, re.I),
95}
98class EntitySubstitutionMeta(type):
99 """Provides lazy access to some data structures and regular
100 expressions used by EntitySubstitution which have a measurable
101 startup cost.
102 """
103 # Trigger for
104 _CLASS_VARIABLES_POPULATED: bool = False
106 @property
107 def HTML_ENTITY_TO_CHARACTER(self) -> Dict[str, str]:
108 """A mapping of entity names like "angmsdaa" to Unicode
109 strings like "⦨".
110 """
111 if not self._CLASS_VARIABLES_POPULATED:
112 self._populate_class_variables()
113 return self._HTML_ENTITY_TO_CHARACTER
114 _HTML_ENTITY_TO_CHARACTER: Dict[str, str]
116 @property
117 def CHARACTER_TO_HTML_ENTITY(self) -> Dict[str, str]:
118 """A mapping of Unicode strings like "⦨" to entity names like
119 "angmsdaa". When a single Unicode string has multiple entity
120 names, we try to choose the most commonly-used name.
121 """
122 if not self._CLASS_VARIABLES_POPULATED:
123 self._populate_class_variables()
124 return self._CHARACTER_TO_HTML_ENTITY
125 _CHARACTER_TO_HTML_ENTITY: Dict[str, str]
127 @property
128 def CHARACTER_TO_HTML_ENTITY_RE(self) -> Pattern[str]:
129 """A regular expression matching (almost) any Unicode string
130 that corresponds to an HTML5 named entity.
131 """
133 if not self._CLASS_VARIABLES_POPULATED:
134 self._populate_class_variables()
135 return self._CHARACTER_TO_HTML_ENTITY_RE
136 _CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]
138 @property
139 def CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE(self) -> Pattern[str]:
140 """A very similar regular expression to
141 CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
142 ampersands. This is used by the 'html' formatter to provide
143 backwards-compatibility, even though the HTML5 spec allows
144 most ampersands to go unescaped.
145 """
146 if not self._CLASS_VARIABLES_POPULATED:
147 self._populate_class_variables()
148 return self._CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE
149 _CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]
151 def _populate_class_variables(self) -> None:
152 """Initialize variables used by EntitySubstitution to manage the plethora of
153 HTML and HTML5 named entities.
155 This method populates the class variables necessary to make
156 the properties defined in the metaclass work.
157 """
158 if self._CLASS_VARIABLES_POPULATED:
159 return
160 unicode_to_name = {}
161 name_to_unicode = {}
163 short_entities = set()
164 long_entities_by_first_character = defaultdict(set)
166 for name_with_semicolon, character in sorted(html5.items()):
167 # "It is intentional, for legacy compatibility, that many
168 # code points have multiple character reference names. For
169 # example, some appear both with and without the trailing
170 # semicolon, or with different capitalizations."
171 # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
172 #
173 # The parsers are in charge of handling (or not) character
174 # references with no trailing semicolon, so we remove the
175 # semicolon whenever it appears.
176 if name_with_semicolon.endswith(";"):
177 name = name_with_semicolon[:-1]
178 else:
179 name = name_with_semicolon
181 # When parsing HTML, we want to recognize any known named
182 # entity and convert it to a sequence of Unicode
183 # characters.
184 if name not in name_to_unicode:
185 name_to_unicode[name] = character
187 # When _generating_ HTML, we want to recognize special
188 # character sequences that _could_ be converted to named
189 # entities.
190 unicode_to_name[character] = name
192 # We also need to build a regular expression that lets us
193 # _find_ those characters in output strings so we can
194 # replace them.
195 #
196 # This is tricky, for two reasons.
198 if len(character) == 1 and ord(character) < 128 and character not in "<>":
199 # First, it would be annoying to turn single ASCII
200 # characters like | into named entities like
201 # |. The exceptions are <>, which we _must_
202 # turn into named entities to produce valid HTML.
203 continue
205 if len(character) > 1 and all(ord(x) < 128 for x in character):
206 # We also do not want to turn _combinations_ of ASCII
207 # characters like 'fj' into named entities like 'fj',
208 # though that's more debateable.
209 continue
211 # Second, some named entities have a Unicode value that's
212 # a subset of the Unicode value for some _other_ named
213 # entity. As an example, \u2267' is ≧,
214 # but '\u2267\u0338' is ≧̸. Our regular
215 # expression needs to match the first two characters of
216 # "\u2267\u0338foo", but only the first character of
217 # "\u2267foo".
218 #
219 # In this step, we build two sets of characters that
220 # _eventually_ need to go into the regular expression. But
221 # we won't know exactly what the regular expression needs
222 # to look like until we've gone through the entire list of
223 # named entities.
224 if len(character) == 1 and character != "&":
225 short_entities.add(character)
226 else:
227 long_entities_by_first_character[character[0]].add(character)
229 # Now that we've been through the entire list of entities, we
230 # can create a regular expression that matches any of them.
231 particles = set()
232 for short in short_entities:
233 long_versions = long_entities_by_first_character[short]
234 if not long_versions:
235 particles.add(short)
236 else:
237 ignore = "".join([x[1] for x in long_versions])
238 # This finds, e.g. \u2267 but only if it is _not_
239 # followed by \u0338.
240 particles.add("%s(?![%s])" % (short, ignore))
242 for long_entities in list(long_entities_by_first_character.values()):
243 for long_entity in long_entities:
244 particles.add(long_entity)
246 re_definition = "(%s)" % "|".join(particles)
248 particles.add("&")
249 re_definition_with_ampersand = "(%s)" % "|".join(particles)
251 # If an entity shows up in both html5 and codepoint2name, it's
252 # likely that HTML5 gives it several different names, such as
253 # 'rsquo' and 'rsquor'. When converting Unicode characters to
254 # named entities, the codepoint2name name should take
255 # precedence where possible, since that's the more easily
256 # recognizable one.
257 for codepoint, name in list(codepoint2name.items()):
258 character = chr(codepoint)
259 unicode_to_name[character] = name
261 self._CHARACTER_TO_HTML_ENTITY = unicode_to_name
262 self._HTML_ENTITY_TO_CHARACTER = name_to_unicode
263 self._CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
264 self._CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
265 re_definition_with_ampersand
266 )
267 self._CLASS_VARIABLES_POPULATED = True
269class EntitySubstitution(metaclass=EntitySubstitutionMeta):
270 """The ability to substitute XML or HTML entities for certain characters."""
272 #: A map of Unicode strings to the corresponding named XML entities.
273 #:
274 #: :meta hide-value:
275 CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
276 "'": "apos",
277 '"': "quot",
278 "&": "amp",
279 "<": "lt",
280 ">": "gt",
281 }
283 # Matches any named or numeric HTML entity.
284 ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)
286 #: A regular expression matching an angle bracket or an ampersand that
287 #: is not part of an XML or HTML entity.
288 #:
289 #: :meta hide-value:
290 BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
291 "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
292 )
294 #: A regular expression matching an angle bracket or an ampersand.
295 #:
296 #: :meta hide-value:
297 AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")
299 @classmethod
300 def _substitute_html_entity(cls, matchobj: re.Match) -> str:
301 """Used with a regular expression to substitute the
302 appropriate HTML entity for a special character string."""
303 original_entity = matchobj.group(0)
304 entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
305 if entity is None:
306 return "&%s;" % original_entity
307 return "&%s;" % entity
309 @classmethod
310 def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
311 """Used with a regular expression to substitute the
312 appropriate XML entity for a special character string."""
313 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
314 return "&%s;" % entity
316 @classmethod
317 def _escape_entity_name(cls, matchobj: re.Match) -> str:
318 return "&%s;" % matchobj.group(1)
320 @classmethod
321 def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
322 possible_entity = matchobj.group(1)
323 if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
324 return "&%s;" % possible_entity
325 return "&%s;" % possible_entity
327 @classmethod
328 def quoted_attribute_value(cls, value: str) -> str:
329 """Make a value into a quoted XML attribute, possibly escaping it.
331 Most strings will be quoted using double quotes.
333 Bob's Bar -> "Bob's Bar"
335 If a string contains double quotes, it will be quoted using
336 single quotes.
338 Welcome to "my bar" -> 'Welcome to "my bar"'
340 If a string contains both single and double quotes, the
341 double quotes will be escaped, and the string will be quoted
342 using double quotes.
344 Welcome to "Bob's Bar" -> Welcome to "Bob's bar"
346 :param value: The XML attribute value to quote
347 :return: The quoted value
348 """
349 quote_with = '"'
350 if '"' in value:
351 if "'" in value:
352 # The string contains both single and double
353 # quotes. Turn the double quotes into
354 # entities. We quote the double quotes rather than
355 # the single quotes because the entity name is
356 # """ whether this is HTML or XML. If we
357 # quoted the single quotes, we'd have to decide
358 # between ' and &squot;.
359 replace_with = """
360 value = value.replace('"', replace_with)
361 else:
362 # There are double quotes but no single quotes.
363 # We can use single quotes to quote the attribute.
364 quote_with = "'"
365 return quote_with + value + quote_with
367 @classmethod
368 def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
369 """Replace special XML characters with named XML entities.
371 The less-than sign will become <, the greater-than sign
372 will become >, and any ampersands will become &. If you
373 want ampersands that seem to be part of an entity definition
374 to be left alone, use `substitute_xml_containing_entities`
375 instead.
377 :param value: A string to be substituted.
379 :param make_quoted_attribute: If True, then the string will be
380 quoted, as befits an attribute value.
382 :return: A version of ``value`` with special characters replaced
383 with named entities.
384 """
385 # Escape angle brackets and ampersands.
386 value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
388 if make_quoted_attribute:
389 value = cls.quoted_attribute_value(value)
390 return value
392 @classmethod
393 def substitute_xml_containing_entities(
394 cls, value: str, make_quoted_attribute: bool = False
395 ) -> str:
396 """Substitute XML entities for special XML characters.
398 :param value: A string to be substituted. The less-than sign will
399 become <, the greater-than sign will become >, and any
400 ampersands that are not part of an entity defition will
401 become &.
403 :param make_quoted_attribute: If True, then the string will be
404 quoted, as befits an attribute value.
405 """
406 # Escape angle brackets, and ampersands that aren't part of
407 # entities.
408 value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
410 if make_quoted_attribute:
411 value = cls.quoted_attribute_value(value)
412 return value
414 @classmethod
415 def substitute_html(cls, s: str) -> str:
416 """Replace certain Unicode characters with named HTML entities.
418 This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
419 in that the goal is to make the result more readable (to those
420 with ASCII displays) rather than to recover from
421 errors. There's absolutely nothing wrong with a UTF-8 string
422 containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
423 character with "é" will make it more readable to some
424 people.
426 :param s: The string to be modified.
427 :return: The string with some Unicode characters replaced with
428 HTML entities.
429 """
430 # Convert any appropriate characters to HTML entities.
431 return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
432 cls._substitute_html_entity, s
433 )
435 @classmethod
436 def substitute_html5(cls, s: str) -> str:
437 """Replace certain Unicode characters with named HTML entities
438 using HTML5 rules.
440 Specifically, this method is much less aggressive about
441 escaping ampersands than substitute_html. Only ambiguous
442 ampersands are escaped, per the HTML5 standard:
444 "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
445 that is followed by one or more ASCII alphanumerics, followed
446 by a U+003B SEMICOLON character (;), where these characters do
447 not match any of the names given in the named character
448 references section."
450 Unlike substitute_html5_raw, this method assumes HTML entities
451 were converted to Unicode characters on the way in, as
452 Beautiful Soup does. By the time Beautiful Soup does its work,
453 the only ambiguous ampersands that need to be escaped are the
454 ones that were escaped in the original markup when mentioning
455 HTML entities.
457 :param s: The string to be modified.
458 :return: The string with some Unicode characters replaced with
459 HTML entities.
460 """
461 # First, escape any HTML entities found in the markup.
462 s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)
464 # Next, convert any appropriate characters to unescaped HTML entities.
465 s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
467 return s
469 @classmethod
470 def substitute_html5_raw(cls, s: str) -> str:
471 """Replace certain Unicode characters with named HTML entities
472 using HTML5 rules.
474 substitute_html5_raw is similar to substitute_html5 but it is
475 designed for standalone use (whereas substitute_html5 is
476 designed for use with Beautiful Soup).
478 :param s: The string to be modified.
479 :return: The string with some Unicode characters replaced with
480 HTML entities.
481 """
482 # First, escape the ampersand for anything that looks like an
483 # entity but isn't in the list of recognized entities. All other
484 # ampersands can be left alone.
485 s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)
487 # Then, convert a range of Unicode characters to unescaped
488 # HTML entities.
489 s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
491 return s
494class EncodingDetector:
495 """This class is capable of guessing a number of possible encodings
496 for a bytestring.
498 Order of precedence:
500 1. Encodings you specifically tell EncodingDetector to try first
501 (the ``known_definite_encodings`` argument to the constructor).
503 2. An encoding determined by sniffing the document's byte-order mark.
505 3. Encodings you specifically tell EncodingDetector to try if
506 byte-order mark sniffing fails (the ``user_encodings`` argument to the
507 constructor).
509 4. An encoding declared within the bytestring itself, either in an
510 XML declaration (if the bytestring is to be interpreted as an XML
511 document), or in a <meta> tag (if the bytestring is to be
512 interpreted as an HTML document.)
514 5. An encoding detected through textual analysis by chardet,
515 cchardet, or a similar external library.
517 6. UTF-8.
519 7. Windows-1252.
521 :param markup: Some markup in an unknown encoding.
523 :param known_definite_encodings: When determining the encoding
524 of ``markup``, these encodings will be tried first, in
525 order. In HTML terms, this corresponds to the "known
526 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
528 :param user_encodings: These encodings will be tried after the
529 ``known_definite_encodings`` have been tried and failed, and
530 after an attempt to sniff the encoding by looking at a
531 byte order mark has failed. In HTML terms, this
532 corresponds to the step "user has explicitly instructed
533 the user agent to override the document's character
534 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
536 :param override_encodings: A **deprecated** alias for
537 ``known_definite_encodings``. Any encodings here will be tried
538 immediately after the encodings in
539 ``known_definite_encodings``.
541 :param is_html: If True, this markup is considered to be
542 HTML. Otherwise it's assumed to be XML.
544 :param exclude_encodings: These encodings will not be tried,
545 even if they otherwise would be.
547 """
549 def __init__(
550 self,
551 markup: bytes,
552 known_definite_encodings: Optional[_Encodings] = None,
553 is_html: Optional[bool] = False,
554 exclude_encodings: Optional[_Encodings] = None,
555 user_encodings: Optional[_Encodings] = None,
556 override_encodings: Optional[_Encodings] = None,
557 ):
558 self.known_definite_encodings = list(known_definite_encodings or [])
559 if override_encodings:
560 warnings.warn(
561 "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
562 DeprecationWarning,
563 stacklevel=3,
564 )
565 self.known_definite_encodings += override_encodings
566 self.user_encodings = user_encodings or []
567 exclude_encodings = exclude_encodings or []
568 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
569 self.chardet_encoding = None
570 self.is_html = False if is_html is None else is_html
571 self.declared_encoding: Optional[str] = None
573 # First order of business: strip a byte-order mark.
574 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
576 known_definite_encodings: _Encodings
577 user_encodings: _Encodings
578 exclude_encodings: _Encodings
579 chardet_encoding: Optional[_Encoding]
580 is_html: bool
581 declared_encoding: Optional[_Encoding]
582 markup: bytes
583 sniffed_encoding: Optional[_Encoding]
585 def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
586 """Should we even bother to try this encoding?
588 :param encoding: Name of an encoding.
589 :param tried: Encodings that have already been tried. This
590 will be modified as a side effect.
591 """
592 if encoding is None:
593 return False
594 encoding = encoding.lower()
595 if encoding in self.exclude_encodings:
596 return False
597 if encoding not in tried:
598 tried.add(encoding)
599 return True
600 return False
602 @property
603 def encodings(self) -> Iterator[_Encoding]:
604 """Yield a number of encodings that might work for this markup.
606 :yield: A sequence of strings. Each is the name of an encoding
607 that *might* work to convert a bytestring into Unicode.
608 """
609 tried: Set[_Encoding] = set()
611 # First, try the known definite encodings
612 for e in self.known_definite_encodings:
613 if self._usable(e, tried):
614 yield e
616 # Did the document originally start with a byte-order mark
617 # that indicated its encoding?
618 if self.sniffed_encoding is not None and self._usable(
619 self.sniffed_encoding, tried
620 ):
621 yield self.sniffed_encoding
623 # Sniffing the byte-order mark did nothing; try the user
624 # encodings.
625 for e in self.user_encodings:
626 if self._usable(e, tried):
627 yield e
629 # Look within the document for an XML or HTML encoding
630 # declaration.
631 if self.declared_encoding is None:
632 self.declared_encoding = self.find_declared_encoding(
633 self.markup, self.is_html
634 )
635 if self.declared_encoding is not None and self._usable(
636 self.declared_encoding, tried
637 ):
638 yield self.declared_encoding
640 # Use third-party character set detection to guess at the
641 # encoding.
642 if self.chardet_encoding is None:
643 self.chardet_encoding = _chardet_dammit(self.markup)
644 if self.chardet_encoding is not None and self._usable(
645 self.chardet_encoding, tried
646 ):
647 yield self.chardet_encoding
649 # As a last-ditch effort, try utf-8 and windows-1252.
650 for e in ("utf-8", "windows-1252"):
651 if self._usable(e, tried):
652 yield e
654 @classmethod
655 def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
656 """If a byte-order mark is present, strip it and return the encoding it implies.
658 :param data: A bytestring that may or may not begin with a
659 byte-order mark.
661 :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
662 """
663 encoding = None
664 if isinstance(data, str):
665 # Unicode data cannot have a byte-order mark.
666 return data, encoding
667 if (
668 (len(data) >= 4)
669 and (data[:2] == b"\xfe\xff")
670 and (data[2:4] != b"\x00\x00")
671 ):
672 encoding = "utf-16be"
673 data = data[2:]
674 elif (
675 (len(data) >= 4)
676 and (data[:2] == b"\xff\xfe")
677 and (data[2:4] != b"\x00\x00")
678 ):
679 encoding = "utf-16le"
680 data = data[2:]
681 elif data[:3] == b"\xef\xbb\xbf":
682 encoding = "utf-8"
683 data = data[3:]
684 elif data[:4] == b"\x00\x00\xfe\xff":
685 encoding = "utf-32be"
686 data = data[4:]
687 elif data[:4] == b"\xff\xfe\x00\x00":
688 encoding = "utf-32le"
689 data = data[4:]
690 return data, encoding
692 @classmethod
693 def find_declared_encoding(
694 cls,
695 markup: Union[bytes, str],
696 is_html: bool = False,
697 search_entire_document: bool = False,
698 ) -> Optional[_Encoding]:
699 """Given a document, tries to find an encoding declared within the
700 text of the document itself.
702 An XML encoding is declared at the beginning of the document.
704 An HTML encoding is declared in a <meta> tag, hopefully near the
705 beginning of the document.
707 :param markup: Some markup.
708 :param is_html: If True, this markup is considered to be HTML. Otherwise
709 it's assumed to be XML.
710 :param search_entire_document: Since an encoding is supposed
711 to declared near the beginning of the document, most of
712 the time it's only necessary to search a few kilobytes of
713 data. Set this to True to force this method to search the
714 entire document.
715 :return: The declared encoding, if one is found.
716 """
717 if search_entire_document:
718 xml_endpos = html_endpos = len(markup)
719 else:
720 xml_endpos = 1024
721 html_endpos = max(2048, int(len(markup) * 0.05))
723 if isinstance(markup, bytes):
724 res = encoding_res[bytes]
725 else:
726 res = encoding_res[str]
728 xml_re = res["xml"]
729 html_re = res["html"]
730 declared_encoding: Optional[_Encoding] = None
731 declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
732 if not declared_encoding_match and is_html:
733 declared_encoding_match = html_re.search(markup, endpos=html_endpos)
734 if declared_encoding_match is not None:
735 declared_encoding = declared_encoding_match.groups()[0]
736 if declared_encoding:
737 if isinstance(declared_encoding, bytes):
738 declared_encoding = declared_encoding.decode("ascii", "replace")
739 return declared_encoding.lower()
740 return None
743class UnicodeDammit:
744 """A class for detecting the encoding of a bytestring containing an
745 HTML or XML document, and decoding it to Unicode. If the source
746 encoding is windows-1252, `UnicodeDammit` can also replace
747 Microsoft smart quotes with their HTML or XML equivalents.
749 :param markup: HTML or XML markup in an unknown encoding.
751 :param known_definite_encodings: When determining the encoding
752 of ``markup``, these encodings will be tried first, in
753 order. In HTML terms, this corresponds to the "known
754 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
756 :param user_encodings: These encodings will be tried after the
757 ``known_definite_encodings`` have been tried and failed, and
758 after an attempt to sniff the encoding by looking at a
759 byte order mark has failed. In HTML terms, this
760 corresponds to the step "user has explicitly instructed
761 the user agent to override the document's character
762 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
764 :param override_encodings: A **deprecated** alias for
765 ``known_definite_encodings``. Any encodings here will be tried
766 immediately after the encodings in
767 ``known_definite_encodings``.
769 :param smart_quotes_to: By default, Microsoft smart quotes will,
770 like all other characters, be converted to Unicode
771 characters. Setting this to ``ascii`` will convert them to ASCII
772 quotes instead. Setting it to ``xml`` will convert them to XML
773 entity references, and setting it to ``html`` will convert them
774 to HTML entity references.
776 :param is_html: If True, ``markup`` is treated as an HTML
777 document. Otherwise it's treated as an XML document.
779 :param exclude_encodings: These encodings will not be considered,
780 even if the sniffing code thinks they might make sense.
782 """
784 def __init__(
785 self,
786 markup: bytes,
787 known_definite_encodings: Optional[_Encodings] = [],
788 smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
789 is_html: bool = False,
790 exclude_encodings: Optional[_Encodings] = [],
791 user_encodings: Optional[_Encodings] = None,
792 override_encodings: Optional[_Encodings] = None,
793 ):
794 self.smart_quotes_to = smart_quotes_to
795 self.tried_encodings = []
796 self.contains_replacement_characters = False
797 self.is_html = is_html
798 self.log = getLogger(__name__)
799 self.detector = EncodingDetector(
800 markup,
801 known_definite_encodings,
802 is_html,
803 exclude_encodings,
804 user_encodings,
805 override_encodings,
806 )
808 # Short-circuit if the data is in Unicode to begin with.
809 if isinstance(markup, str):
810 self.markup = markup.encode("utf8")
811 self.unicode_markup = markup
812 self.original_encoding = None
813 return
815 # The encoding detector may have stripped a byte-order mark.
816 # Use the stripped markup from this point on.
817 self.markup = self.detector.markup
819 u = None
820 for encoding in self.detector.encodings:
821 markup = self.detector.markup
822 u = self._convert_from(encoding)
823 if u is not None:
824 break
826 if not u:
827 # None of the encodings worked. As an absolute last resort,
828 # try them again with character replacement.
830 for encoding in self.detector.encodings:
831 if encoding != "ascii":
832 u = self._convert_from(encoding, "replace")
833 if u is not None:
834 self.log.warning(
835 "Some characters could not be decoded, and were "
836 "replaced with REPLACEMENT CHARACTER."
837 )
839 self.contains_replacement_characters = True
840 break
842 # If none of that worked, we could at this point force it to
843 # ASCII, but that would destroy so much data that I think
844 # giving up is better.
845 #
846 # Note that this is extremely unlikely, probably impossible,
847 # because the "replace" strategy is so powerful. Even running
848 # the Python binary through Unicode, Dammit gives you Unicode,
849 # albeit Unicode riddled with REPLACEMENT CHARACTER.
850 if u is None:
851 self.original_encoding = None
852 self.unicode_markup = None
853 else:
854 self.unicode_markup = u
856 #: The original markup, before it was converted to Unicode.
857 #: This is not necessarily the same as what was passed in to the
858 #: constructor, since any byte-order mark will be stripped.
859 markup: bytes
861 #: The Unicode version of the markup, following conversion. This
862 #: is set to None if there was simply no way to convert the
863 #: bytestring to Unicode (as with binary data).
864 unicode_markup: Optional[str]
866 #: This is True if `UnicodeDammit.unicode_markup` contains
867 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
868 #: in `UnicodeDammit.markup`. These mark character sequences that
869 #: could not be represented in Unicode.
870 contains_replacement_characters: bool
872 #: Unicode, Dammit's best guess as to the original character
873 #: encoding of `UnicodeDammit.markup`.
874 original_encoding: Optional[_Encoding]
876 #: The strategy used to handle Microsoft smart quotes.
877 smart_quotes_to: Optional[str]
879 #: The (encoding, error handling strategy) 2-tuples that were used to
880 #: try and convert the markup to Unicode.
881 tried_encodings: List[Tuple[_Encoding, str]]
883 log: Logger #: :meta private:
885 def _sub_ms_char(self, match: re.Match) -> bytes:
886 """Changes a MS smart quote character to an XML or HTML
887 entity, or an ASCII character.
889 TODO: Since this is only used to convert smart quotes, it
890 could be simplified, and MS_CHARS_TO_ASCII made much less
891 parochial.
892 """
893 orig: bytes = match.group(1)
894 sub: bytes
895 if self.smart_quotes_to == "ascii":
896 if orig in self.MS_CHARS_TO_ASCII:
897 sub = self.MS_CHARS_TO_ASCII[orig].encode()
898 else:
899 # Shouldn't happen; substitute the character
900 # with itself.
901 sub = orig
902 else:
903 if orig in self.MS_CHARS:
904 substitutions = self.MS_CHARS[orig]
905 if type(substitutions) is tuple:
906 if self.smart_quotes_to == "xml":
907 sub = b"&#x" + substitutions[1].encode() + b";"
908 else:
909 sub = b"&" + substitutions[0].encode() + b";"
910 else:
911 substitutions = cast(str, substitutions)
912 sub = substitutions.encode()
913 else:
914 # Shouldn't happen; substitute the character
915 # for itself.
916 sub = orig
917 return sub
919 #: This dictionary maps commonly seen values for "charset" in HTML
920 #: meta tags to the corresponding Python codec names. It only covers
921 #: values that aren't in Python's aliases and can't be determined
922 #: by the heuristics in `find_codec`.
923 #:
924 #: :meta hide-value:
925 CHARSET_ALIASES: Dict[str, _Encoding] = {
926 "macintosh": "mac-roman",
927 "x-sjis": "shift-jis",
928 }
930 #: A list of encodings that tend to contain Microsoft smart quotes.
931 #:
932 #: :meta hide-value:
933 ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
934 "windows-1252",
935 "iso-8859-1",
936 "iso-8859-2",
937 ]
939 def _convert_from(
940 self, proposed: _Encoding, errors: str = "strict"
941 ) -> Optional[str]:
942 """Attempt to convert the markup to the proposed encoding.
944 :param proposed: The name of a character encoding.
945 :param errors: An error handling strategy, used when calling `str`.
946 :return: The converted markup, or `None` if the proposed
947 encoding/error handling strategy didn't work.
948 """
949 lookup_result = self.find_codec(proposed)
950 if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
951 return None
952 proposed = lookup_result
953 self.tried_encodings.append((proposed, errors))
954 markup = self.markup
955 # Convert smart quotes to HTML if coming from an encoding
956 # that might have them.
957 if (
958 self.smart_quotes_to is not None
959 and proposed in self.ENCODINGS_WITH_SMART_QUOTES
960 ):
961 smart_quotes_re = b"([\x80-\x9f])"
962 smart_quotes_compiled = re.compile(smart_quotes_re)
963 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
965 try:
966 # print("Trying to convert document to %s (errors=%s)" % (
967 # proposed, errors))
968 u = self._to_unicode(markup, proposed, errors)
969 self.unicode_markup = u
970 self.original_encoding = proposed
971 except Exception:
972 # print("That didn't work!")
973 # print(e)
974 return None
975 # print("Correct encoding: %s" % proposed)
976 return self.unicode_markup
978 def _to_unicode(
979 self, data: bytes, encoding: _Encoding, errors: str = "strict"
980 ) -> str:
981 """Given a bytestring and its encoding, decodes the string into Unicode.
983 :param encoding: The name of an encoding.
984 :param errors: An error handling strategy, used when calling `str`.
985 """
986 return str(data, encoding, errors)
988 @property
989 def declared_html_encoding(self) -> Optional[_Encoding]:
990 """If the markup is an HTML document, returns the encoding, if any,
991 declared *inside* the document.
992 """
993 if not self.is_html:
994 return None
995 return self.detector.declared_encoding
997 def find_codec(self, charset: _Encoding) -> Optional[str]:
998 """Look up the Python codec corresponding to a given character set.
1000 :param charset: The name of a character set.
1001 :return: The name of a Python codec.
1002 """
1003 value = (
1004 self._codec(self.CHARSET_ALIASES.get(charset, charset))
1005 or (charset and self._codec(charset.replace("-", "")))
1006 or (charset and self._codec(charset.replace("-", "_")))
1007 or (charset and charset.lower())
1008 or charset
1009 )
1010 if value:
1011 return value.lower()
1012 return None
1014 def _codec(self, charset: _Encoding) -> Optional[str]:
1015 if not charset:
1016 return charset
1017 codec = None
1018 try:
1019 codecs.lookup(charset)
1020 codec = charset
1021 except (LookupError, ValueError):
1022 pass
1023 return codec
1025 #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
1026 #:
1027 #: :meta hide-value:
1028 MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
1029 b"\x80": ("euro", "20AC"),
1030 b"\x81": " ",
1031 b"\x82": ("sbquo", "201A"),
1032 b"\x83": ("fnof", "192"),
1033 b"\x84": ("bdquo", "201E"),
1034 b"\x85": ("hellip", "2026"),
1035 b"\x86": ("dagger", "2020"),
1036 b"\x87": ("Dagger", "2021"),
1037 b"\x88": ("circ", "2C6"),
1038 b"\x89": ("permil", "2030"),
1039 b"\x8a": ("Scaron", "160"),
1040 b"\x8b": ("lsaquo", "2039"),
1041 b"\x8c": ("OElig", "152"),
1042 b"\x8d": "?",
1043 b"\x8e": ("#x17D", "17D"),
1044 b"\x8f": "?",
1045 b"\x90": "?",
1046 b"\x91": ("lsquo", "2018"),
1047 b"\x92": ("rsquo", "2019"),
1048 b"\x93": ("ldquo", "201C"),
1049 b"\x94": ("rdquo", "201D"),
1050 b"\x95": ("bull", "2022"),
1051 b"\x96": ("ndash", "2013"),
1052 b"\x97": ("mdash", "2014"),
1053 b"\x98": ("tilde", "2DC"),
1054 b"\x99": ("trade", "2122"),
1055 b"\x9a": ("scaron", "161"),
1056 b"\x9b": ("rsaquo", "203A"),
1057 b"\x9c": ("oelig", "153"),
1058 b"\x9d": "?",
1059 b"\x9e": ("#x17E", "17E"),
1060 b"\x9f": ("Yuml", ""),
1061 }
1063 #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
1064 #: horrors like stripping diacritical marks to turn á into a, but also
1065 #: contains non-horrors like turning “ into ".
1066 #:
1067 #: Seriously, don't use this for anything other than removing smart
1068 #: quotes.
1069 #:
1070 #: :meta private:
1071 MS_CHARS_TO_ASCII: Dict[bytes, str] = {
1072 b"\x80": "EUR",
1073 b"\x81": " ",
1074 b"\x82": ",",
1075 b"\x83": "f",
1076 b"\x84": ",,",
1077 b"\x85": "...",
1078 b"\x86": "+",
1079 b"\x87": "++",
1080 b"\x88": "^",
1081 b"\x89": "%",
1082 b"\x8a": "S",
1083 b"\x8b": "<",
1084 b"\x8c": "OE",
1085 b"\x8d": "?",
1086 b"\x8e": "Z",
1087 b"\x8f": "?",
1088 b"\x90": "?",
1089 b"\x91": "'",
1090 b"\x92": "'",
1091 b"\x93": '"',
1092 b"\x94": '"',
1093 b"\x95": "*",
1094 b"\x96": "-",
1095 b"\x97": "--",
1096 b"\x98": "~",
1097 b"\x99": "(TM)",
1098 b"\x9a": "s",
1099 b"\x9b": ">",
1100 b"\x9c": "oe",
1101 b"\x9d": "?",
1102 b"\x9e": "z",
1103 b"\x9f": "Y",
1104 b"\xa0": " ",
1105 b"\xa1": "!",
1106 b"\xa2": "c",
1107 b"\xa3": "GBP",
1108 b"\xa4": "$", # This approximation is especially parochial--this is the
1109 # generic currency symbol.
1110 b"\xa5": "YEN",
1111 b"\xa6": "|",
1112 b"\xa7": "S",
1113 b"\xa8": "..",
1114 b"\xa9": "",
1115 b"\xaa": "(th)",
1116 b"\xab": "<<",
1117 b"\xac": "!",
1118 b"\xad": " ",
1119 b"\xae": "(R)",
1120 b"\xaf": "-",
1121 b"\xb0": "o",
1122 b"\xb1": "+-",
1123 b"\xb2": "2",
1124 b"\xb3": "3",
1125 b"\xb4": "'",
1126 b"\xb5": "u",
1127 b"\xb6": "P",
1128 b"\xb7": "*",
1129 b"\xb8": ",",
1130 b"\xb9": "1",
1131 b"\xba": "(th)",
1132 b"\xbb": ">>",
1133 b"\xbc": "1/4",
1134 b"\xbd": "1/2",
1135 b"\xbe": "3/4",
1136 b"\xbf": "?",
1137 b"\xc0": "A",
1138 b"\xc1": "A",
1139 b"\xc2": "A",
1140 b"\xc3": "A",
1141 b"\xc4": "A",
1142 b"\xc5": "A",
1143 b"\xc6": "AE",
1144 b"\xc7": "C",
1145 b"\xc8": "E",
1146 b"\xc9": "E",
1147 b"\xca": "E",
1148 b"\xcb": "E",
1149 b"\xcc": "I",
1150 b"\xcd": "I",
1151 b"\xce": "I",
1152 b"\xcf": "I",
1153 b"\xd0": "D",
1154 b"\xd1": "N",
1155 b"\xd2": "O",
1156 b"\xd3": "O",
1157 b"\xd4": "O",
1158 b"\xd5": "O",
1159 b"\xd6": "O",
1160 b"\xd7": "*",
1161 b"\xd8": "O",
1162 b"\xd9": "U",
1163 b"\xda": "U",
1164 b"\xdb": "U",
1165 b"\xdc": "U",
1166 b"\xdd": "Y",
1167 b"\xde": "b",
1168 b"\xdf": "B",
1169 b"\xe0": "a",
1170 b"\xe1": "a",
1171 b"\xe2": "a",
1172 b"\xe3": "a",
1173 b"\xe4": "a",
1174 b"\xe5": "a",
1175 b"\xe6": "ae",
1176 b"\xe7": "c",
1177 b"\xe8": "e",
1178 b"\xe9": "e",
1179 b"\xea": "e",
1180 b"\xeb": "e",
1181 b"\xec": "i",
1182 b"\xed": "i",
1183 b"\xee": "i",
1184 b"\xef": "i",
1185 b"\xf0": "o",
1186 b"\xf1": "n",
1187 b"\xf2": "o",
1188 b"\xf3": "o",
1189 b"\xf4": "o",
1190 b"\xf5": "o",
1191 b"\xf6": "o",
1192 b"\xf7": "/",
1193 b"\xf8": "o",
1194 b"\xf9": "u",
1195 b"\xfa": "u",
1196 b"\xfb": "u",
1197 b"\xfc": "u",
1198 b"\xfd": "y",
1199 b"\xfe": "b",
1200 b"\xff": "y",
1201 }
1203 #: A map used when removing rogue Windows-1252/ISO-8859-1
1204 #: characters in otherwise UTF-8 documents. Also used when a
1205 #: numeric character entity has been incorrectly encoded using the
1206 #: character's Windows-1252 encoding.
1207 #:
1208 #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
1209 #: Windows-1252.
1210 #:
1211 #: :meta hide-value:
1212 WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
1213 0x80: b"\xe2\x82\xac", # €
1214 0x82: b"\xe2\x80\x9a", # ‚
1215 0x83: b"\xc6\x92", # ƒ
1216 0x84: b"\xe2\x80\x9e", # „
1217 0x85: b"\xe2\x80\xa6", # …
1218 0x86: b"\xe2\x80\xa0", # †
1219 0x87: b"\xe2\x80\xa1", # ‡
1220 0x88: b"\xcb\x86", # ˆ
1221 0x89: b"\xe2\x80\xb0", # ‰
1222 0x8A: b"\xc5\xa0", # Š
1223 0x8B: b"\xe2\x80\xb9", # ‹
1224 0x8C: b"\xc5\x92", # Œ
1225 0x8E: b"\xc5\xbd", # Ž
1226 0x91: b"\xe2\x80\x98", # ‘
1227 0x92: b"\xe2\x80\x99", # ’
1228 0x93: b"\xe2\x80\x9c", # “
1229 0x94: b"\xe2\x80\x9d", # ”
1230 0x95: b"\xe2\x80\xa2", # •
1231 0x96: b"\xe2\x80\x93", # –
1232 0x97: b"\xe2\x80\x94", # —
1233 0x98: b"\xcb\x9c", # ˜
1234 0x99: b"\xe2\x84\xa2", # ™
1235 0x9A: b"\xc5\xa1", # š
1236 0x9B: b"\xe2\x80\xba", # ›
1237 0x9C: b"\xc5\x93", # œ
1238 0x9E: b"\xc5\xbe", # ž
1239 0x9F: b"\xc5\xb8", # Ÿ
1240 0xA0: b"\xc2\xa0", #
1241 0xA1: b"\xc2\xa1", # ¡
1242 0xA2: b"\xc2\xa2", # ¢
1243 0xA3: b"\xc2\xa3", # £
1244 0xA4: b"\xc2\xa4", # ¤
1245 0xA5: b"\xc2\xa5", # ¥
1246 0xA6: b"\xc2\xa6", # ¦
1247 0xA7: b"\xc2\xa7", # §
1248 0xA8: b"\xc2\xa8", # ¨
1249 0xA9: b"\xc2\xa9", # ©
1250 0xAA: b"\xc2\xaa", # ª
1251 0xAB: b"\xc2\xab", # «
1252 0xAC: b"\xc2\xac", # ¬
1253 0xAD: b"\xc2\xad", #
1254 0xAE: b"\xc2\xae", # ®
1255 0xAF: b"\xc2\xaf", # ¯
1256 0xB0: b"\xc2\xb0", # °
1257 0xB1: b"\xc2\xb1", # ±
1258 0xB2: b"\xc2\xb2", # ²
1259 0xB3: b"\xc2\xb3", # ³
1260 0xB4: b"\xc2\xb4", # ´
1261 0xB5: b"\xc2\xb5", # µ
1262 0xB6: b"\xc2\xb6", # ¶
1263 0xB7: b"\xc2\xb7", # ·
1264 0xB8: b"\xc2\xb8", # ¸
1265 0xB9: b"\xc2\xb9", # ¹
1266 0xBA: b"\xc2\xba", # º
1267 0xBB: b"\xc2\xbb", # »
1268 0xBC: b"\xc2\xbc", # ¼
1269 0xBD: b"\xc2\xbd", # ½
1270 0xBE: b"\xc2\xbe", # ¾
1271 0xBF: b"\xc2\xbf", # ¿
1272 0xC0: b"\xc3\x80", # À
1273 0xC1: b"\xc3\x81", # Á
1274 0xC2: b"\xc3\x82", # Â
1275 0xC3: b"\xc3\x83", # Ã
1276 0xC4: b"\xc3\x84", # Ä
1277 0xC5: b"\xc3\x85", # Å
1278 0xC6: b"\xc3\x86", # Æ
1279 0xC7: b"\xc3\x87", # Ç
1280 0xC8: b"\xc3\x88", # È
1281 0xC9: b"\xc3\x89", # É
1282 0xCA: b"\xc3\x8a", # Ê
1283 0xCB: b"\xc3\x8b", # Ë
1284 0xCC: b"\xc3\x8c", # Ì
1285 0xCD: b"\xc3\x8d", # Í
1286 0xCE: b"\xc3\x8e", # Î
1287 0xCF: b"\xc3\x8f", # Ï
1288 0xD0: b"\xc3\x90", # Ð
1289 0xD1: b"\xc3\x91", # Ñ
1290 0xD2: b"\xc3\x92", # Ò
1291 0xD3: b"\xc3\x93", # Ó
1292 0xD4: b"\xc3\x94", # Ô
1293 0xD5: b"\xc3\x95", # Õ
1294 0xD6: b"\xc3\x96", # Ö
1295 0xD7: b"\xc3\x97", # ×
1296 0xD8: b"\xc3\x98", # Ø
1297 0xD9: b"\xc3\x99", # Ù
1298 0xDA: b"\xc3\x9a", # Ú
1299 0xDB: b"\xc3\x9b", # Û
1300 0xDC: b"\xc3\x9c", # Ü
1301 0xDD: b"\xc3\x9d", # Ý
1302 0xDE: b"\xc3\x9e", # Þ
1303 0xDF: b"\xc3\x9f", # ß
1304 0xE0: b"\xc3\xa0", # à
1305 0xE1: b"\xa1", # á
1306 0xE2: b"\xc3\xa2", # â
1307 0xE3: b"\xc3\xa3", # ã
1308 0xE4: b"\xc3\xa4", # ä
1309 0xE5: b"\xc3\xa5", # å
1310 0xE6: b"\xc3\xa6", # æ
1311 0xE7: b"\xc3\xa7", # ç
1312 0xE8: b"\xc3\xa8", # è
1313 0xE9: b"\xc3\xa9", # é
1314 0xEA: b"\xc3\xaa", # ê
1315 0xEB: b"\xc3\xab", # ë
1316 0xEC: b"\xc3\xac", # ì
1317 0xED: b"\xc3\xad", # í
1318 0xEE: b"\xc3\xae", # î
1319 0xEF: b"\xc3\xaf", # ï
1320 0xF0: b"\xc3\xb0", # ð
1321 0xF1: b"\xc3\xb1", # ñ
1322 0xF2: b"\xc3\xb2", # ò
1323 0xF3: b"\xc3\xb3", # ó
1324 0xF4: b"\xc3\xb4", # ô
1325 0xF5: b"\xc3\xb5", # õ
1326 0xF6: b"\xc3\xb6", # ö
1327 0xF7: b"\xc3\xb7", # ÷
1328 0xF8: b"\xc3\xb8", # ø
1329 0xF9: b"\xc3\xb9", # ù
1330 0xFA: b"\xc3\xba", # ú
1331 0xFB: b"\xc3\xbb", # û
1332 0xFC: b"\xc3\xbc", # ü
1333 0xFD: b"\xc3\xbd", # ý
1334 0xFE: b"\xc3\xbe", # þ
1335 0xFF: b"\xc3\xbf", # ÿ
1336 }
1338 #: :meta private
1339 # Note that this isn't all Unicode noncharacters, just the noncontiguous ones that need to be listed.
1340 #
1341 # "A noncharacter is a code point that is in the range
1342 # U+FDD0 to U+FDEF, inclusive, or U+FFFE, U+FFFF, U+1FFFE,
1343 # U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE,
1344 # U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
1345 # U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE,
1346 # U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
1347 # U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE,
1348 # or U+10FFFF."
1349 ENUMERATED_NONCHARACTERS: Set[int] = set([0xfffe, 0xffff,
1350 0x1fffe, 0x1ffff,
1351 0x2fffe, 0x2ffff,
1352 0x3fffe, 0x3ffff,
1353 0x4fffe, 0x4ffff,
1354 0x5fffe, 0x5ffff,
1355 0x6fffe, 0x6ffff,
1356 0x7fffe, 0x7ffff,
1357 0x8fffe, 0x8ffff,
1358 0x9fffe, 0x9ffff,
1359 0xafffe, 0xaffff,
1360 0xbfffe, 0xbffff,
1361 0xcfffe, 0xcffff,
1362 0xdfffe, 0xdffff,
1363 0xefffe, 0xeffff,
1364 0xffffe, 0xfffff,
1365 0x10fffe, 0x10ffff])
1367 #: :meta private:
1368 MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
1369 (0xC2, 0xDF, 2), # 2-byte characters start with a byte C2-DF
1370 (0xE0, 0xEF, 3), # 3-byte characters start with E0-EF
1371 (0xF0, 0xF4, 4), # 4-byte characters start with F0-F4
1372 ]
1374 #: :meta private:
1375 FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]
1377 #: :meta private:
1378 LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
1380 @classmethod
1381 def numeric_character_reference(cls, numeric:int) -> Tuple[str, bool]:
1382 """This (mostly) implements the algorithm described in "Numeric character
1383 reference end state" from the HTML spec:
1384 https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
1386 The algorithm is designed to convert numeric character references like "☃"
1387 to Unicode characters like "☃".
1389 :return: A 2-tuple (character, replaced). `character` is the Unicode
1390 character corresponding to the numeric reference and `replaced` is
1391 whether or not an unresolvable character was replaced with REPLACEMENT
1392 CHARACTER.
1393 """
1394 replacement = "\ufffd"
1396 if numeric == 0x00:
1397 # "If the number is 0x00, then this is a
1398 # null-character-reference parse error. Set the character
1399 # reference code to 0xFFFD."
1400 return replacement, True
1402 if numeric > 0x10ffff:
1403 # "If the number is greater than 0x10FFFF, then this is a
1404 # character-reference-outside-unicode-range parse
1405 # error. Set the character reference code to 0xFFFD."
1406 return replacement, True
1408 if numeric >= 0xd800 and numeric <= 0xdfff:
1409 # "If the number is a surrogate, then this is a
1410 # surrogate-character-reference parse error. Set the
1411 # character reference code to 0xFFFD."
1412 return replacement, True
1414 if (numeric >= 0xfdd0 and numeric <= 0xfdef) or numeric in cls.ENUMERATED_NONCHARACTERS:
1415 # "If the number is a noncharacter, then this is a
1416 # noncharacter-character-reference parse error."
1417 #
1418 # "The parser resolves such character references as-is."
1419 #
1420 # I'm not sure what "as-is" means but I think it means that we act
1421 # like there was no error condition.
1422 return chr(numeric), False
1424 # "If the number is 0x0D, or a control that's not ASCII whitespace,
1425 # then this is a control-character-reference parse error."
1426 #
1427 # "A control is a C0 control or a code point in the range
1428 # U+007F DELETE to U+009F APPLICATION PROGRAM COMMAND,
1429 # inclusive."
1430 #
1431 # "A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive."
1432 #
1433 # "The parser resolves such character references as-is except C1 control references that are replaced."
1435 # First, let's replace the control references that can be replaced.
1436 if numeric >= 0x80 and numeric <= 0x9f and numeric in cls.WINDOWS_1252_TO_UTF8:
1437 # "If the number is one of the numbers in the first column of the
1438 # following table, then find the row with that number in the first
1439 # column, and set the character reference code to the number in the
1440 # second column of that row."
1441 #
1442 # This is an attempt to catch characters that were encoded to numeric
1443 # entities using their Windows-1252 encodings rather than their UTF-8
1444 # encodings.
1445 return cls.WINDOWS_1252_TO_UTF8[numeric].decode("utf8"), False
1447 # Now all that's left are references that should be resolved as-is. This
1448 # is also the default path for non-weird character references.
1449 try:
1450 return chr(numeric), False
1451 except (ValueError, OverflowError):
1452 # This shouldn't happen, since these cases should have been handled
1453 # above, but if it does, return REPLACEMENT CHARACTER
1454 return replacement, True
1456 @classmethod
1457 def detwingle(
1458 cls,
1459 in_bytes: bytes,
1460 main_encoding: _Encoding = "utf8",
1461 embedded_encoding: _Encoding = "windows-1252",
1462 ) -> bytes:
1463 """Fix characters from one encoding embedded in some other encoding.
1465 Currently the only situation supported is Windows-1252 (or its
1466 subset ISO-8859-1), embedded in UTF-8.
1468 :param in_bytes: A bytestring that you suspect contains
1469 characters from multiple encodings. Note that this *must*
1470 be a bytestring. If you've already converted the document
1471 to Unicode, you're too late.
1472 :param main_encoding: The primary encoding of ``in_bytes``.
1473 :param embedded_encoding: The encoding that was used to embed characters
1474 in the main document.
1475 :return: A bytestring similar to ``in_bytes``, in which
1476 ``embedded_encoding`` characters have been converted to
1477 their ``main_encoding`` equivalents.
1478 """
1479 if embedded_encoding.replace("_", "-").lower() not in (
1480 "windows-1252",
1481 "windows_1252",
1482 ):
1483 raise NotImplementedError(
1484 "Windows-1252 and ISO-8859-1 are the only currently supported "
1485 "embedded encodings."
1486 )
1488 if main_encoding.lower() not in ("utf8", "utf-8"):
1489 raise NotImplementedError(
1490 "UTF-8 is the only currently supported main encoding."
1491 )
1493 byte_chunks = []
1495 chunk_start = 0
1496 pos = 0
1497 while pos < len(in_bytes):
1498 byte = in_bytes[pos]
1499 if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
1500 # This is the start of a UTF-8 multibyte character. Skip
1501 # to the end.
1502 for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
1503 if byte >= start and byte <= end:
1504 pos += size
1505 break
1506 elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
1507 # We found a Windows-1252 character!
1508 # Save the string up to this point as a chunk.
1509 byte_chunks.append(in_bytes[chunk_start:pos])
1511 # Now translate the Windows-1252 character into UTF-8
1512 # and add it as another, one-byte chunk.
1513 byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
1514 pos += 1
1515 chunk_start = pos
1516 else:
1517 # Go on to the next character.
1518 pos += 1
1519 if chunk_start == 0:
1520 # The string is unchanged.
1521 return in_bytes
1522 else:
1523 # Store the final chunk.
1524 byte_chunks.append(in_bytes[chunk_start:])
1525 return b"".join(byte_chunks)