Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/dammit.py: 41%
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's `Universal
6Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
7by Kurt McKee. It does not rewrite the body of an XML or HTML document
8to reflect a new encoding; that's the job of `TreeBuilder`.
10"""
12# Use of this source code is governed by the MIT license.
13__license__ = "MIT"
15from html.entities import codepoint2name
16from collections import defaultdict
17import codecs
18from html.entities import html5
19import re
20from logging import Logger, getLogger
21from types import ModuleType
22from typing import (
23 Dict,
24 Iterator,
25 List,
26 Optional,
27 Pattern,
28 Set,
29 Tuple,
30 Type,
31 Union,
32 cast,
33)
34from typing_extensions import Literal
35from bs4._typing import (
36 _Encoding,
37 _Encodings,
38)
39import warnings
41# Import a library to autodetect character encodings. We'll support
42# any of a number of libraries that all support the same API:
43#
44# * cchardet
45# * chardet
46# * charset-normalizer
47chardet_module: Optional[ModuleType] = None
48try:
49 # PyPI package: cchardet
50 import cchardet # type:ignore
52 chardet_module = cchardet
53except ImportError:
54 try:
55 # Debian package: python-chardet
56 # PyPI package: chardet
57 import chardet
59 chardet_module = chardet
60 except ImportError:
61 try:
62 # PyPI package: charset-normalizer
63 import charset_normalizer # type:ignore
65 chardet_module = charset_normalizer
66 except ImportError:
67 # No chardet available.
68 pass
71def _chardet_dammit(s: bytes) -> Optional[str]:
72 """Try as hard as possible to detect the encoding of a bytestring."""
73 if chardet_module is None or isinstance(s, str):
74 return None
75 module = chardet_module
76 return module.detect(s)["encoding"]
79# Build bytestring and Unicode versions of regular expressions for finding
80# a declared encoding inside an XML or HTML document.
81xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>" #: :meta private:
82html_meta: str = (
83 "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]" #: :meta private:
84)
86# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
87encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
88encoding_res[bytes] = {
89 "html": re.compile(html_meta.encode("ascii"), re.I),
90 "xml": re.compile(xml_encoding.encode("ascii"), re.I),
91}
92encoding_res[str] = {
93 "html": re.compile(html_meta, re.I),
94 "xml": re.compile(xml_encoding, re.I),
95}
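# A brief illustration of how these patterns are meant to match (an editor's
# sketch, not part of the library's test suite; the inputs are hypothetical
# but the captured groups follow from the regexes above):
#
#     >>> encoding_res[bytes]["html"].search(b'<meta charset="utf-8">').group(1)
#     b'utf-8'
#     >>> encoding_res[bytes]["xml"].search(b'<?xml version="1.0" encoding="ISO-8859-1"?>').group(1)
#     b'ISO-8859-1'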
98class EntitySubstitution(object):
99 """The ability to substitute XML or HTML entities for certain characters."""
101 #: A map of named HTML entities to the corresponding Unicode string.
102 #:
103 #: :meta hide-value:
104 HTML_ENTITY_TO_CHARACTER: Dict[str, str]
106 #: A map of Unicode strings to the corresponding named HTML entities;
107 #: the inverse of HTML_ENTITY_TO_CHARACTER.
108 #:
109 #: :meta hide-value:
110 CHARACTER_TO_HTML_ENTITY: Dict[str, str]
112 #: A regular expression that matches any character (or, in rare
113 #: cases, pair of characters) that can be replaced with a named
114 #: HTML entity.
115 #:
116 #: :meta hide-value:
117 CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]
119 #: A very similar regular expression to
120 #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
121 #: ampersands. This is used by the 'html' formatter to provide
122 #: backwards-compatibility, even though the HTML5 spec allows most
123 #: ampersands to go unescaped.
124 #:
125 #: :meta hide-value:
126 CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]
128 @classmethod
129 def _populate_class_variables(cls) -> None:
130 """Initialize variables used by this class to manage the plethora of
131 HTML5 named entities.
133 This function sets the following class variables:
135 CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
136 entity names like "angmsdaa". When a single Unicode string has
137 multiple entity names, we try to choose the most commonly-used
138 name.
140 HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
141 Unicode strings like "⦨".
143 CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
144 Unicode string that corresponds to an HTML5 named entity.
146 CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
147 regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
148 also matches unescaped ampersands. This is used by the 'html'
149 formatter to provide backwards-compatibility, even though the HTML5
150 spec allows most ampersands to go unescaped.
151 """
152 unicode_to_name = {}
153 name_to_unicode = {}
155 short_entities = set()
156 long_entities_by_first_character = defaultdict(set)
158 for name_with_semicolon, character in sorted(html5.items()):
159 # "It is intentional, for legacy compatibility, that many
160 # code points have multiple character reference names. For
161 # example, some appear both with and without the trailing
162 # semicolon, or with different capitalizations."
163 # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
164 #
165 # The parsers are in charge of handling (or not) character
166 # references with no trailing semicolon, so we remove the
167 # semicolon whenever it appears.
168 if name_with_semicolon.endswith(";"):
169 name = name_with_semicolon[:-1]
170 else:
171 name = name_with_semicolon
173 # When parsing HTML, we want to recognize any known named
174 # entity and convert it to a sequence of Unicode
175 # characters.
176 if name not in name_to_unicode:
177 name_to_unicode[name] = character
179 # When _generating_ HTML, we want to recognize special
180 # character sequences that _could_ be converted to named
181 # entities.
182 unicode_to_name[character] = name
184 # We also need to build a regular expression that lets us
185 # _find_ those characters in output strings so we can
186 # replace them.
187 #
188 # This is tricky, for two reasons.
190 if len(character) == 1 and ord(character) < 128 and character not in "<>":
191 # First, it would be annoying to turn single ASCII
192 # characters like "|" into named entities like
193 # "&verbar;". The exceptions are <>, which we _must_
194 # turn into named entities to produce valid HTML.
195 continue
197 if len(character) > 1 and all(ord(x) < 128 for x in character):
198 # We also do not want to turn _combinations_ of ASCII
199 # characters like 'fj' into named entities like '&fjlig;',
200 # though that's more debatable.
201 continue
203 # Second, some named entities have a Unicode value that's
204 # a subset of the Unicode value for some _other_ named
205 # entity. As an example, '\u2267' is &GreaterFullEqual;,
206 # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
207 # expression needs to match the first two characters of
208 # "\u2267\u0338foo", but only the first character of
209 # "\u2267foo".
210 #
211 # In this step, we build two sets of characters that
212 # _eventually_ need to go into the regular expression. But
213 # we won't know exactly what the regular expression needs
214 # to look like until we've gone through the entire list of
215 # named entities.
216 if len(character) == 1 and character != "&":
217 short_entities.add(character)
218 else:
219 long_entities_by_first_character[character[0]].add(character)
221 # Now that we've been through the entire list of entities, we
222 # can create a regular expression that matches any of them.
223 particles = set()
224 for short in short_entities:
225 long_versions = long_entities_by_first_character[short]
226 if not long_versions:
227 particles.add(short)
228 else:
229 ignore = "".join([x[1] for x in long_versions])
230 # This finds, e.g. \u2267 but only if it is _not_
231 # followed by \u0338.
232 particles.add("%s(?![%s])" % (short, ignore))
234 for long_entities in list(long_entities_by_first_character.values()):
235 for long_entity in long_entities:
236 particles.add(long_entity)
238 re_definition = "(%s)" % "|".join(particles)
240 particles.add("&")
241 re_definition_with_ampersand = "(%s)" % "|".join(particles)
243 # If an entity shows up in both html5 and codepoint2name, it's
244 # likely that HTML5 gives it several different names, such as
245 # 'rsquo' and 'rsquor'. When converting Unicode characters to
246 # named entities, the codepoint2name name should take
247 # precedence where possible, since that's the more easily
248 # recognizable one.
249 for codepoint, name in list(codepoint2name.items()):
250 character = chr(codepoint)
251 unicode_to_name[character] = name
253 cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
254 cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
255 cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
256 cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
257 re_definition_with_ampersand
258 )
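# An illustrative sketch of the lookahead trick described above (a hypothetical
# doctest, not from the library): once the class variables are populated, the
# regular expression should consume both characters of the longer sequence but
# only one character of the shorter one:
#
#     >>> re_ = EntitySubstitution.CHARACTER_TO_HTML_ENTITY_RE
#     >>> re_.match("\u2267\u0338rest").group(0) == "\u2267\u0338"
#     True
#     >>> re_.match("\u2267rest").group(0) == "\u2267"
#     True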
260 #: A map of Unicode strings to the corresponding named XML entities.
261 #:
262 #: :meta hide-value:
263 CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
264 "'": "apos",
265 '"': "quot",
266 "&": "amp",
267 "<": "lt",
268 ">": "gt",
269 }
271 # Matches any named or numeric HTML entity.
272 ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)
274 #: A regular expression matching an angle bracket or an ampersand that
275 #: is not part of an XML or HTML entity.
276 #:
277 #: :meta hide-value:
278 BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
279 "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
280 )
282 #: A regular expression matching an angle bracket or an ampersand.
283 #:
284 #: :meta hide-value:
285 AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")
287 @classmethod
288 def _substitute_html_entity(cls, matchobj: re.Match) -> str:
289 """Used with a regular expression to substitute the
290 appropriate HTML entity for a special character string."""
291 original_entity = matchobj.group(0)
292 entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
293 if entity is None:
294 return "&%s;" % original_entity
295 return "&%s;" % entity
297 @classmethod
298 def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
299 """Used with a regular expression to substitute the
300 appropriate XML entity for a special character string."""
301 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
302 return "&%s;" % entity
304 @classmethod
305 def _escape_entity_name(cls, matchobj: re.Match) -> str:
306 return "&amp;%s;" % matchobj.group(1)
308 @classmethod
309 def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
310 possible_entity = matchobj.group(1)
311 if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
312 return "&%s;" % possible_entity
313 return "&amp;%s;" % possible_entity
315 @classmethod
316 def quoted_attribute_value(cls, value: str) -> str:
317 """Make a value into a quoted XML attribute, possibly escaping it.
319 Most strings will be quoted using double quotes.
321 Bob's Bar -> "Bob's Bar"
323 If a string contains double quotes, it will be quoted using
324 single quotes.
326 Welcome to "my bar" -> 'Welcome to "my bar"'
328 If a string contains both single and double quotes, the
329 double quotes will be escaped, and the string will be quoted
330 using double quotes.
332 Welcome to "Bob's Bar" -> Welcome to &quot;Bob's bar&quot;
334 :param value: The XML attribute value to quote
335 :return: The quoted value
336 """
337 quote_with = '"'
338 if '"' in value:
339 if "'" in value:
340 # The string contains both single and double
341 # quotes. Turn the double quotes into
342 # entities. We quote the double quotes rather than
343 # the single quotes because the entity name is
344 # "&quot;" whether this is HTML or XML. If we
345 # quoted the single quotes, we'd have to decide
346 # between &apos; and &squot;.
347 replace_with = "&quot;"
348 value = value.replace('"', replace_with)
349 else:
350 # There are double quotes but no single quotes.
351 # We can use single quotes to quote the attribute.
352 quote_with = "'"
353 return quote_with + value + quote_with
355 @classmethod
356 def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
357 """Replace special XML characters with named XML entities.
359 The less-than sign will become &lt;, the greater-than sign
360 will become &gt;, and any ampersands will become &amp;. If you
361 want ampersands that seem to be part of an entity definition
362 to be left alone, use `substitute_xml_containing_entities`
363 instead.
365 :param value: A string to be substituted.
367 :param make_quoted_attribute: If True, then the string will be
368 quoted, as befits an attribute value.
370 :return: A version of ``value`` with special characters replaced
371 with named entities.
372 """
373 # Escape angle brackets and ampersands.
374 value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
376 if make_quoted_attribute:
377 value = cls.quoted_attribute_value(value)
378 return value
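# A hypothetical usage sketch (editor's illustration, not from the library's
# documentation); given the substitutions above, one would expect roughly:
#
#     >>> EntitySubstitution.substitute_xml('AT&T <rocks>', make_quoted_attribute=True)
#     '"AT&amp;T &lt;rocks&gt;"'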
380 @classmethod
381 def substitute_xml_containing_entities(
382 cls, value: str, make_quoted_attribute: bool = False
383 ) -> str:
384 """Substitute XML entities for special XML characters.
386 :param value: A string to be substituted. The less-than sign will
387 become &lt;, the greater-than sign will become &gt;, and any
388 ampersands that are not part of an entity definition will
389 become &amp;.
391 :param make_quoted_attribute: If True, then the string will be
392 quoted, as befits an attribute value.
393 """
394 # Escape angle brackets, and ampersands that aren't part of
395 # entities.
396 value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
398 if make_quoted_attribute:
399 value = cls.quoted_attribute_value(value)
400 return value
402 @classmethod
403 def substitute_html(cls, s: str) -> str:
404 """Replace certain Unicode characters with named HTML entities.
406 This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
407 in that the goal is to make the result more readable (to those
408 with ASCII displays) rather than to recover from
409 errors. There's absolutely nothing wrong with a UTF-8 string
410 containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
411 character with "&eacute;" will make it more readable to some
412 people.
414 :param s: The string to be modified.
415 :return: The string with some Unicode characters replaced with
416 HTML entities.
417 """
418 # Convert any appropriate characters to HTML entities.
419 return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
420 cls._substitute_html_entity, s
421 )
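# A hypothetical example of the behaviour described above (editor's sketch):
#
#     >>> EntitySubstitution.substitute_html("café & résumé")
#     'caf&eacute; &amp; r&eacute;sum&eacute;'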
423 @classmethod
424 def substitute_html5(cls, s: str) -> str:
425 """Replace certain Unicode characters with named HTML entities
426 using HTML5 rules.
428 Specifically, this method is much less aggressive about
429 escaping ampersands than substitute_html. Only ambiguous
430 ampersands are escaped, per the HTML5 standard:
432 "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
433 that is followed by one or more ASCII alphanumerics, followed
434 by a U+003B SEMICOLON character (;), where these characters do
435 not match any of the names given in the named character
436 references section."
438 Unlike substitute_html5_raw, this method assumes HTML entities
439 were converted to Unicode characters on the way in, as
440 Beautiful Soup does. By the time Beautiful Soup does its work,
441 the only ambiguous ampersands that need to be escaped are the
442 ones that were escaped in the original markup when mentioning
443 HTML entities.
445 :param s: The string to be modified.
446 :return: The string with some Unicode characters replaced with
447 HTML entities.
448 """
449 # First, escape any HTML entities found in the markup.
450 s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)
452 # Next, convert any appropriate characters to unescaped HTML entities.
453 s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
455 return s
457 @classmethod
458 def substitute_html5_raw(cls, s: str) -> str:
459 """Replace certain Unicode characters with named HTML entities
460 using HTML5 rules.
462 substitute_html5_raw is similar to substitute_html5 but it is
463 designed for standalone use (whereas substitute_html5 is
464 designed for use with Beautiful Soup).
466 :param s: The string to be modified.
467 :return: The string with some Unicode characters replaced with
468 HTML entities.
469 """
470 # First, escape the ampersand for anything that looks like an
471 # entity but isn't in the list of recognized entities. All other
472 # ampersands can be left alone.
473 s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)
475 # Then, convert a range of Unicode characters to unescaped
476 # HTML entities.
477 s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
479 return s
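# An illustrative, hypothetical doctest contrasting this with substitute_html:
# a bare ampersand that is not part of an entity-like sequence should be left
# alone, while characters with named entities are still converted:
#
#     >>> EntitySubstitution.substitute_html5_raw("café & croissants")
#     'caf&eacute; & croissants'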
482EntitySubstitution._populate_class_variables()
485class EncodingDetector:
486 """This class is capable of guessing a number of possible encodings
487 for a bytestring.
489 Order of precedence:
491 1. Encodings you specifically tell EncodingDetector to try first
492 (the ``known_definite_encodings`` argument to the constructor).
494 2. An encoding determined by sniffing the document's byte-order mark.
496 3. Encodings you specifically tell EncodingDetector to try if
497 byte-order mark sniffing fails (the ``user_encodings`` argument to the
498 constructor).
500 4. An encoding declared within the bytestring itself, either in an
501 XML declaration (if the bytestring is to be interpreted as an XML
502 document), or in a <meta> tag (if the bytestring is to be
503 interpreted as an HTML document.)
505 5. An encoding detected through textual analysis by chardet,
506 cchardet, or a similar external library.
508 6. UTF-8.
510 7. Windows-1252.
512 :param markup: Some markup in an unknown encoding.
514 :param known_definite_encodings: When determining the encoding
515 of ``markup``, these encodings will be tried first, in
516 order. In HTML terms, this corresponds to the "known
517 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
519 :param user_encodings: These encodings will be tried after the
520 ``known_definite_encodings`` have been tried and failed, and
521 after an attempt to sniff the encoding by looking at a
522 byte order mark has failed. In HTML terms, this
523 corresponds to the step "user has explicitly instructed
524 the user agent to override the document's character
525 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
527 :param override_encodings: A **deprecated** alias for
528 ``known_definite_encodings``. Any encodings here will be tried
529 immediately after the encodings in
530 ``known_definite_encodings``.
532 :param is_html: If True, this markup is considered to be
533 HTML. Otherwise it's assumed to be XML.
535 :param exclude_encodings: These encodings will not be tried,
536 even if they otherwise would be.
538 """
540 def __init__(
541 self,
542 markup: bytes,
543 known_definite_encodings: Optional[_Encodings] = None,
544 is_html: Optional[bool] = False,
545 exclude_encodings: Optional[_Encodings] = None,
546 user_encodings: Optional[_Encodings] = None,
547 override_encodings: Optional[_Encodings] = None,
548 ):
549 self.known_definite_encodings = list(known_definite_encodings or [])
550 if override_encodings:
551 warnings.warn(
552 "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
553 DeprecationWarning,
554 stacklevel=3,
555 )
556 self.known_definite_encodings += override_encodings
557 self.user_encodings = user_encodings or []
558 exclude_encodings = exclude_encodings or []
559 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
560 self.chardet_encoding = None
561 self.is_html = False if is_html is None else is_html
562 self.declared_encoding: Optional[str] = None
564 # First order of business: strip a byte-order mark.
565 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
567 known_definite_encodings: _Encodings
568 user_encodings: _Encodings
569 exclude_encodings: _Encodings
570 chardet_encoding: Optional[_Encoding]
571 is_html: bool
572 declared_encoding: Optional[_Encoding]
573 markup: bytes
574 sniffed_encoding: Optional[_Encoding]
576 def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
577 """Should we even bother to try this encoding?
579 :param encoding: Name of an encoding.
580 :param tried: Encodings that have already been tried. This
581 will be modified as a side effect.
582 """
583 if encoding is None:
584 return False
585 encoding = encoding.lower()
586 if encoding in self.exclude_encodings:
587 return False
588 if encoding not in tried:
589 tried.add(encoding)
590 return True
591 return False
593 @property
594 def encodings(self) -> Iterator[_Encoding]:
595 """Yield a number of encodings that might work for this markup.
597 :yield: A sequence of strings. Each is the name of an encoding
598 that *might* work to convert a bytestring into Unicode.
599 """
600 tried: Set[_Encoding] = set()
602 # First, try the known definite encodings
603 for e in self.known_definite_encodings:
604 if self._usable(e, tried):
605 yield e
607 # Did the document originally start with a byte-order mark
608 # that indicated its encoding?
609 if self.sniffed_encoding is not None and self._usable(
610 self.sniffed_encoding, tried
611 ):
612 yield self.sniffed_encoding
614 # Sniffing the byte-order mark did nothing; try the user
615 # encodings.
616 for e in self.user_encodings:
617 if self._usable(e, tried):
618 yield e
620 # Look within the document for an XML or HTML encoding
621 # declaration.
622 if self.declared_encoding is None:
623 self.declared_encoding = self.find_declared_encoding(
624 self.markup, self.is_html
625 )
626 if self.declared_encoding is not None and self._usable(
627 self.declared_encoding, tried
628 ):
629 yield self.declared_encoding
631 # Use third-party character set detection to guess at the
632 # encoding.
633 if self.chardet_encoding is None:
634 self.chardet_encoding = _chardet_dammit(self.markup)
635 if self.chardet_encoding is not None and self._usable(
636 self.chardet_encoding, tried
637 ):
638 yield self.chardet_encoding
640 # As a last-ditch effort, try utf-8 and windows-1252.
641 for e in ("utf-8", "windows-1252"):
642 if self._usable(e, tried):
643 yield e
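# An illustrative (hypothetical) walk through the precedence order above; later
# candidates depend on which chardet-style library is installed, so only the
# start of the sequence is predictable:
#
#     >>> detector = EncodingDetector(b'<?xml version="1.0" encoding="utf-8"?><a/>')
#     >>> next(detector.encodings)
#     'utf-8'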
645 @classmethod
646 def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
647 """If a byte-order mark is present, strip it and return the encoding it implies.
649 :param data: A bytestring that may or may not begin with a
650 byte-order mark.
652 :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
653 """
654 encoding = None
655 if isinstance(data, str):
656 # Unicode data cannot have a byte-order mark.
657 return data, encoding
658 if (
659 (len(data) >= 4)
660 and (data[:2] == b"\xfe\xff")
661 and (data[2:4] != b"\x00\x00")
662 ):
663 encoding = "utf-16be"
664 data = data[2:]
665 elif (
666 (len(data) >= 4)
667 and (data[:2] == b"\xff\xfe")
668 and (data[2:4] != b"\x00\x00")
669 ):
670 encoding = "utf-16le"
671 data = data[2:]
672 elif data[:3] == b"\xef\xbb\xbf":
673 encoding = "utf-8"
674 data = data[3:]
675 elif data[:4] == b"\x00\x00\xfe\xff":
676 encoding = "utf-32be"
677 data = data[4:]
678 elif data[:4] == b"\xff\xfe\x00\x00":
679 encoding = "utf-32le"
680 data = data[4:]
681 return data, encoding
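# Hypothetical examples of BOM sniffing (an editor's sketch based on the
# branches above):
#
#     >>> EncodingDetector.strip_byte_order_mark(b"\xef\xbb\xbfhello")
#     (b'hello', 'utf-8')
#     >>> EncodingDetector.strip_byte_order_mark(b"\xff\xfeh\x00i\x00")
#     (b'h\x00i\x00', 'utf-16le')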
683 @classmethod
684 def find_declared_encoding(
685 cls,
686 markup: Union[bytes, str],
687 is_html: bool = False,
688 search_entire_document: bool = False,
689 ) -> Optional[_Encoding]:
690 """Given a document, tries to find an encoding declared within the
691 text of the document itself.
693 An XML encoding is declared at the beginning of the document.
695 An HTML encoding is declared in a <meta> tag, hopefully near the
696 beginning of the document.
698 :param markup: Some markup.
699 :param is_html: If True, this markup is considered to be HTML. Otherwise
700 it's assumed to be XML.
701 :param search_entire_document: Since an encoding is supposed
702 to be declared near the beginning of the document, most of
703 the time it's only necessary to search a few kilobytes of
704 data. Set this to True to force this method to search the
705 entire document.
706 :return: The declared encoding, if one is found.
707 """
708 if search_entire_document:
709 xml_endpos = html_endpos = len(markup)
710 else:
711 xml_endpos = 1024
712 html_endpos = max(2048, int(len(markup) * 0.05))
714 if isinstance(markup, bytes):
715 res = encoding_res[bytes]
716 else:
717 res = encoding_res[str]
719 xml_re = res["xml"]
720 html_re = res["html"]
721 declared_encoding: Optional[_Encoding] = None
722 declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
723 if not declared_encoding_match and is_html:
724 declared_encoding_match = html_re.search(markup, endpos=html_endpos)
725 if declared_encoding_match is not None:
726 declared_encoding = declared_encoding_match.groups()[0]
727 if declared_encoding:
728 if isinstance(declared_encoding, bytes):
729 declared_encoding = declared_encoding.decode("ascii", "replace")
730 return declared_encoding.lower()
731 return None
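# Hypothetical examples (an editor's sketch) of declared-encoding detection:
#
#     >>> EncodingDetector.find_declared_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
#     'iso-8859-1'
#     >>> EncodingDetector.find_declared_encoding(b'<meta charset="utf-8">', is_html=True)
#     'utf-8'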
734class UnicodeDammit:
735 """A class for detecting the encoding of a bytestring containing an
736 HTML or XML document, and decoding it to Unicode. If the source
737 encoding is windows-1252, `UnicodeDammit` can also replace
738 Microsoft smart quotes with their HTML or XML equivalents.
740 :param markup: HTML or XML markup in an unknown encoding.
742 :param known_definite_encodings: When determining the encoding
743 of ``markup``, these encodings will be tried first, in
744 order. In HTML terms, this corresponds to the "known
745 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
747 :param user_encodings: These encodings will be tried after the
748 ``known_definite_encodings`` have been tried and failed, and
749 after an attempt to sniff the encoding by looking at a
750 byte order mark has failed. In HTML terms, this
751 corresponds to the step "user has explicitly instructed
752 the user agent to override the document's character
753 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
755 :param override_encodings: A **deprecated** alias for
756 ``known_definite_encodings``. Any encodings here will be tried
757 immediately after the encodings in
758 ``known_definite_encodings``.
760 :param smart_quotes_to: By default, Microsoft smart quotes will,
761 like all other characters, be converted to Unicode
762 characters. Setting this to ``ascii`` will convert them to ASCII
763 quotes instead. Setting it to ``xml`` will convert them to XML
764 entity references, and setting it to ``html`` will convert them
765 to HTML entity references.
767 :param is_html: If True, ``markup`` is treated as an HTML
768 document. Otherwise it's treated as an XML document.
770 :param exclude_encodings: These encodings will not be considered,
771 even if the sniffing code thinks they might make sense.
773 """
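# A minimal usage sketch (hypothetical doctest, not taken verbatim from the
# library's documentation), assuming "latin-1" is passed as a known definite
# encoding:
#
#     >>> dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1"])
#     >>> dammit.unicode_markup
#     'Sacré bleu!'
#     >>> dammit.original_encoding
#     'latin-1'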
775 def __init__(
776 self,
777 markup: bytes,
778 known_definite_encodings: Optional[_Encodings] = [],
779 smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
780 is_html: bool = False,
781 exclude_encodings: Optional[_Encodings] = [],
782 user_encodings: Optional[_Encodings] = None,
783 override_encodings: Optional[_Encodings] = None,
784 ):
785 self.smart_quotes_to = smart_quotes_to
786 self.tried_encodings = []
787 self.contains_replacement_characters = False
788 self.is_html = is_html
789 self.log = getLogger(__name__)
790 self.detector = EncodingDetector(
791 markup,
792 known_definite_encodings,
793 is_html,
794 exclude_encodings,
795 user_encodings,
796 override_encodings,
797 )
799 # Short-circuit if the data is in Unicode to begin with.
800 if isinstance(markup, str):
801 self.markup = markup.encode("utf8")
802 self.unicode_markup = markup
803 self.original_encoding = None
804 return
806 # The encoding detector may have stripped a byte-order mark.
807 # Use the stripped markup from this point on.
808 self.markup = self.detector.markup
810 u = None
811 for encoding in self.detector.encodings:
812 markup = self.detector.markup
813 u = self._convert_from(encoding)
814 if u is not None:
815 break
817 if not u:
818 # None of the encodings worked. As an absolute last resort,
819 # try them again with character replacement.
821 for encoding in self.detector.encodings:
822 if encoding != "ascii":
823 u = self._convert_from(encoding, "replace")
824 if u is not None:
825 self.log.warning(
826 "Some characters could not be decoded, and were "
827 "replaced with REPLACEMENT CHARACTER."
828 )
830 self.contains_replacement_characters = True
831 break
833 # If none of that worked, we could at this point force it to
834 # ASCII, but that would destroy so much data that I think
835 # giving up is better.
836 #
837 # Note that this is extremely unlikely, probably impossible,
838 # because the "replace" strategy is so powerful. Even running
839 # the Python binary through Unicode, Dammit gives you Unicode,
840 # albeit Unicode riddled with REPLACEMENT CHARACTER.
841 if u is None:
842 self.original_encoding = None
843 self.unicode_markup = None
844 else:
845 self.unicode_markup = u
847 #: The original markup, before it was converted to Unicode.
848 #: This is not necessarily the same as what was passed in to the
849 #: constructor, since any byte-order mark will be stripped.
850 markup: bytes
852 #: The Unicode version of the markup, following conversion. This
853 #: is set to None if there was simply no way to convert the
854 #: bytestring to Unicode (as with binary data).
855 unicode_markup: Optional[str]
857 #: This is True if `UnicodeDammit.unicode_markup` contains
858 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
859 #: in `UnicodeDammit.markup`. These mark character sequences that
860 #: could not be represented in Unicode.
861 contains_replacement_characters: bool
863 #: Unicode, Dammit's best guess as to the original character
864 #: encoding of `UnicodeDammit.markup`.
865 original_encoding: Optional[_Encoding]
867 #: The strategy used to handle Microsoft smart quotes.
868 smart_quotes_to: Optional[str]
870 #: The (encoding, error handling strategy) 2-tuples that were used to
871 #: try and convert the markup to Unicode.
872 tried_encodings: List[Tuple[_Encoding, str]]
874 log: Logger #: :meta private:
876 def _sub_ms_char(self, match: re.Match) -> bytes:
877 """Changes a MS smart quote character to an XML or HTML
878 entity, or an ASCII character.
880 TODO: Since this is only used to convert smart quotes, it
881 could be simplified, and MS_CHARS_TO_ASCII made much less
882 parochial.
883 """
884 orig: bytes = match.group(1)
885 sub: bytes
886 if self.smart_quotes_to == "ascii":
887 if orig in self.MS_CHARS_TO_ASCII:
888 sub = self.MS_CHARS_TO_ASCII[orig].encode()
889 else:
890 # Shouldn't happen; substitute the character
891 # with itself.
892 sub = orig
893 else:
894 if orig in self.MS_CHARS:
895 substitutions = self.MS_CHARS[orig]
896 if type(substitutions) is tuple:
897 if self.smart_quotes_to == "xml":
898 sub = b"&#x" + substitutions[1].encode() + b";"
899 else:
900 sub = b"&" + substitutions[0].encode() + b";"
901 else:
902 substitutions = cast(str, substitutions)
903 sub = substitutions.encode()
904 else:
905 # Shouldn't happen; substitute the character
906 # for itself.
907 sub = orig
908 return sub
910 #: This dictionary maps commonly seen values for "charset" in HTML
911 #: meta tags to the corresponding Python codec names. It only covers
912 #: values that aren't in Python's aliases and can't be determined
913 #: by the heuristics in `find_codec`.
914 #:
915 #: :meta hide-value:
916 CHARSET_ALIASES: Dict[str, _Encoding] = {
917 "macintosh": "mac-roman",
918 "x-sjis": "shift-jis",
919 }
921 #: A list of encodings that tend to contain Microsoft smart quotes.
922 #:
923 #: :meta hide-value:
924 ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
925 "windows-1252",
926 "iso-8859-1",
927 "iso-8859-2",
928 ]
930 def _convert_from(
931 self, proposed: _Encoding, errors: str = "strict"
932 ) -> Optional[str]:
933 """Attempt to convert the markup to the proposed encoding.
935 :param proposed: The name of a character encoding.
936 :param errors: An error handling strategy, used when calling `str`.
937 :return: The converted markup, or `None` if the proposed
938 encoding/error handling strategy didn't work.
939 """
940 lookup_result = self.find_codec(proposed)
941 if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
942 return None
943 proposed = lookup_result
944 self.tried_encodings.append((proposed, errors))
945 markup = self.markup
946 # Convert smart quotes to HTML if coming from an encoding
947 # that might have them.
948 if (
949 self.smart_quotes_to is not None
950 and proposed in self.ENCODINGS_WITH_SMART_QUOTES
951 ):
952 smart_quotes_re = b"([\x80-\x9f])"
953 smart_quotes_compiled = re.compile(smart_quotes_re)
954 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
956 try:
957 # print("Trying to convert document to %s (errors=%s)" % (
958 # proposed, errors))
959 u = self._to_unicode(markup, proposed, errors)
960 self.unicode_markup = u
961 self.original_encoding = proposed
962 except Exception:
963 # print("That didn't work!")
964 # print(e)
965 return None
966 # print("Correct encoding: %s" % proposed)
967 return self.unicode_markup
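# An illustrative (hypothetical) example of the smart-quote handling performed
# in _convert_from, assuming the markup really is Windows-1252:
#
#     >>> markup = b"I just \x93love\x94 Microsoft Word\x92s smart quotes"
#     >>> UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
#     'I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes'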
969 def _to_unicode(
970 self, data: bytes, encoding: _Encoding, errors: str = "strict"
971 ) -> str:
972 """Given a bytestring and its encoding, decodes the string into Unicode.
974 :param encoding: The name of an encoding.
975 :param errors: An error handling strategy, used when calling `str`.
976 """
977 return str(data, encoding, errors)
979 @property
980 def declared_html_encoding(self) -> Optional[_Encoding]:
981 """If the markup is an HTML document, returns the encoding, if any,
982 declared *inside* the document.
983 """
984 if not self.is_html:
985 return None
986 return self.detector.declared_encoding
988 def find_codec(self, charset: _Encoding) -> Optional[str]:
989 """Look up the Python codec corresponding to a given character set.
991 :param charset: The name of a character set.
992 :return: The name of a Python codec.
993 """
994 value = (
995 self._codec(self.CHARSET_ALIASES.get(charset, charset))
996 or (charset and self._codec(charset.replace("-", "")))
997 or (charset and self._codec(charset.replace("-", "_")))
998 or (charset and charset.lower())
999 or charset
1000 )
1001 if value:
1002 return value.lower()
1003 return None
1005 def _codec(self, charset: _Encoding) -> Optional[str]:
1006 if not charset:
1007 return charset
1008 codec = None
1009 try:
1010 codecs.lookup(charset)
1011 codec = charset
1012 except (LookupError, ValueError):
1013 pass
1014 return codec
1016 #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
1017 #:
1018 #: :meta hide-value:
1019 MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
1020 b"\x80": ("euro", "20AC"),
1021 b"\x81": " ",
1022 b"\x82": ("sbquo", "201A"),
1023 b"\x83": ("fnof", "192"),
1024 b"\x84": ("bdquo", "201E"),
1025 b"\x85": ("hellip", "2026"),
1026 b"\x86": ("dagger", "2020"),
1027 b"\x87": ("Dagger", "2021"),
1028 b"\x88": ("circ", "2C6"),
1029 b"\x89": ("permil", "2030"),
1030 b"\x8a": ("Scaron", "160"),
1031 b"\x8b": ("lsaquo", "2039"),
1032 b"\x8c": ("OElig", "152"),
1033 b"\x8d": "?",
1034 b"\x8e": ("#x17D", "17D"),
1035 b"\x8f": "?",
1036 b"\x90": "?",
1037 b"\x91": ("lsquo", "2018"),
1038 b"\x92": ("rsquo", "2019"),
1039 b"\x93": ("ldquo", "201C"),
1040 b"\x94": ("rdquo", "201D"),
1041 b"\x95": ("bull", "2022"),
1042 b"\x96": ("ndash", "2013"),
1043 b"\x97": ("mdash", "2014"),
1044 b"\x98": ("tilde", "2DC"),
1045 b"\x99": ("trade", "2122"),
1046 b"\x9a": ("scaron", "161"),
1047 b"\x9b": ("rsaquo", "203A"),
1048 b"\x9c": ("oelig", "153"),
1049 b"\x9d": "?",
1050 b"\x9e": ("#x17E", "17E"),
1051 b"\x9f": ("Yuml", ""),
1052 }
1054 #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
1055 #: horrors like stripping diacritical marks to turn á into a, but also
1056 #: contains non-horrors like turning “ into ".
1057 #:
1058 #: Seriously, don't use this for anything other than removing smart
1059 #: quotes.
1060 #:
1061 #: :meta private:
1062 MS_CHARS_TO_ASCII: Dict[bytes, str] = {
1063 b"\x80": "EUR",
1064 b"\x81": " ",
1065 b"\x82": ",",
1066 b"\x83": "f",
1067 b"\x84": ",,",
1068 b"\x85": "...",
1069 b"\x86": "+",
1070 b"\x87": "++",
1071 b"\x88": "^",
1072 b"\x89": "%",
1073 b"\x8a": "S",
1074 b"\x8b": "<",
1075 b"\x8c": "OE",
1076 b"\x8d": "?",
1077 b"\x8e": "Z",
1078 b"\x8f": "?",
1079 b"\x90": "?",
1080 b"\x91": "'",
1081 b"\x92": "'",
1082 b"\x93": '"',
1083 b"\x94": '"',
1084 b"\x95": "*",
1085 b"\x96": "-",
1086 b"\x97": "--",
1087 b"\x98": "~",
1088 b"\x99": "(TM)",
1089 b"\x9a": "s",
1090 b"\x9b": ">",
1091 b"\x9c": "oe",
1092 b"\x9d": "?",
1093 b"\x9e": "z",
1094 b"\x9f": "Y",
1095 b"\xa0": " ",
1096 b"\xa1": "!",
1097 b"\xa2": "c",
1098 b"\xa3": "GBP",
1099 b"\xa4": "$", # This approximation is especially parochial--this is the
1100 # generic currency symbol.
1101 b"\xa5": "YEN",
1102 b"\xa6": "|",
1103 b"\xa7": "S",
1104 b"\xa8": "..",
1105 b"\xa9": "",
1106 b"\xaa": "(th)",
1107 b"\xab": "<<",
1108 b"\xac": "!",
1109 b"\xad": " ",
1110 b"\xae": "(R)",
1111 b"\xaf": "-",
1112 b"\xb0": "o",
1113 b"\xb1": "+-",
1114 b"\xb2": "2",
1115 b"\xb3": "3",
1116 b"\xb4": "'",
1117 b"\xb5": "u",
1118 b"\xb6": "P",
1119 b"\xb7": "*",
1120 b"\xb8": ",",
1121 b"\xb9": "1",
1122 b"\xba": "(th)",
1123 b"\xbb": ">>",
1124 b"\xbc": "1/4",
1125 b"\xbd": "1/2",
1126 b"\xbe": "3/4",
1127 b"\xbf": "?",
1128 b"\xc0": "A",
1129 b"\xc1": "A",
1130 b"\xc2": "A",
1131 b"\xc3": "A",
1132 b"\xc4": "A",
1133 b"\xc5": "A",
1134 b"\xc6": "AE",
1135 b"\xc7": "C",
1136 b"\xc8": "E",
1137 b"\xc9": "E",
1138 b"\xca": "E",
1139 b"\xcb": "E",
1140 b"\xcc": "I",
1141 b"\xcd": "I",
1142 b"\xce": "I",
1143 b"\xcf": "I",
1144 b"\xd0": "D",
1145 b"\xd1": "N",
1146 b"\xd2": "O",
1147 b"\xd3": "O",
1148 b"\xd4": "O",
1149 b"\xd5": "O",
1150 b"\xd6": "O",
1151 b"\xd7": "*",
1152 b"\xd8": "O",
1153 b"\xd9": "U",
1154 b"\xda": "U",
1155 b"\xdb": "U",
1156 b"\xdc": "U",
1157 b"\xdd": "Y",
1158 b"\xde": "b",
1159 b"\xdf": "B",
1160 b"\xe0": "a",
1161 b"\xe1": "a",
1162 b"\xe2": "a",
1163 b"\xe3": "a",
1164 b"\xe4": "a",
1165 b"\xe5": "a",
1166 b"\xe6": "ae",
1167 b"\xe7": "c",
1168 b"\xe8": "e",
1169 b"\xe9": "e",
1170 b"\xea": "e",
1171 b"\xeb": "e",
1172 b"\xec": "i",
1173 b"\xed": "i",
1174 b"\xee": "i",
1175 b"\xef": "i",
1176 b"\xf0": "o",
1177 b"\xf1": "n",
1178 b"\xf2": "o",
1179 b"\xf3": "o",
1180 b"\xf4": "o",
1181 b"\xf5": "o",
1182 b"\xf6": "o",
1183 b"\xf7": "/",
1184 b"\xf8": "o",
1185 b"\xf9": "u",
1186 b"\xfa": "u",
1187 b"\xfb": "u",
1188 b"\xfc": "u",
1189 b"\xfd": "y",
1190 b"\xfe": "b",
1191 b"\xff": "y",
1192 }
1194 #: A map used when removing rogue Windows-1252/ISO-8859-1
1195 #: characters in otherwise UTF-8 documents. Also used when a
1196 #: numeric character entity has been incorrectly encoded using the
1197 #: character's Windows-1252 encoding.
1198 #:
1199 #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
1200 #: Windows-1252.
1201 #:
1202 #: :meta hide-value:
1203 WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
1204 0x80: b"\xe2\x82\xac", # €
1205 0x82: b"\xe2\x80\x9a", # ‚
1206 0x83: b"\xc6\x92", # ƒ
1207 0x84: b"\xe2\x80\x9e", # „
1208 0x85: b"\xe2\x80\xa6", # …
1209 0x86: b"\xe2\x80\xa0", # †
1210 0x87: b"\xe2\x80\xa1", # ‡
1211 0x88: b"\xcb\x86", # ˆ
1212 0x89: b"\xe2\x80\xb0", # ‰
1213 0x8A: b"\xc5\xa0", # Š
1214 0x8B: b"\xe2\x80\xb9", # ‹
1215 0x8C: b"\xc5\x92", # Œ
1216 0x8E: b"\xc5\xbd", # Ž
1217 0x91: b"\xe2\x80\x98", # ‘
1218 0x92: b"\xe2\x80\x99", # ’
1219 0x93: b"\xe2\x80\x9c", # “
1220 0x94: b"\xe2\x80\x9d", # ”
1221 0x95: b"\xe2\x80\xa2", # •
1222 0x96: b"\xe2\x80\x93", # –
1223 0x97: b"\xe2\x80\x94", # —
1224 0x98: b"\xcb\x9c", # ˜
1225 0x99: b"\xe2\x84\xa2", # ™
1226 0x9A: b"\xc5\xa1", # š
1227 0x9B: b"\xe2\x80\xba", # ›
1228 0x9C: b"\xc5\x93", # œ
1229 0x9E: b"\xc5\xbe", # ž
1230 0x9F: b"\xc5\xb8", # Ÿ
1231 0xA0: b"\xc2\xa0", #
1232 0xA1: b"\xc2\xa1", # ¡
1233 0xA2: b"\xc2\xa2", # ¢
1234 0xA3: b"\xc2\xa3", # £
1235 0xA4: b"\xc2\xa4", # ¤
1236 0xA5: b"\xc2\xa5", # ¥
1237 0xA6: b"\xc2\xa6", # ¦
1238 0xA7: b"\xc2\xa7", # §
1239 0xA8: b"\xc2\xa8", # ¨
1240 0xA9: b"\xc2\xa9", # ©
1241 0xAA: b"\xc2\xaa", # ª
1242 0xAB: b"\xc2\xab", # «
1243 0xAC: b"\xc2\xac", # ¬
1244 0xAD: b"\xc2\xad", #
1245 0xAE: b"\xc2\xae", # ®
1246 0xAF: b"\xc2\xaf", # ¯
1247 0xB0: b"\xc2\xb0", # °
1248 0xB1: b"\xc2\xb1", # ±
1249 0xB2: b"\xc2\xb2", # ²
1250 0xB3: b"\xc2\xb3", # ³
1251 0xB4: b"\xc2\xb4", # ´
1252 0xB5: b"\xc2\xb5", # µ
1253 0xB6: b"\xc2\xb6", # ¶
1254 0xB7: b"\xc2\xb7", # ·
1255 0xB8: b"\xc2\xb8", # ¸
1256 0xB9: b"\xc2\xb9", # ¹
1257 0xBA: b"\xc2\xba", # º
1258 0xBB: b"\xc2\xbb", # »
1259 0xBC: b"\xc2\xbc", # ¼
1260 0xBD: b"\xc2\xbd", # ½
1261 0xBE: b"\xc2\xbe", # ¾
1262 0xBF: b"\xc2\xbf", # ¿
1263 0xC0: b"\xc3\x80", # À
1264 0xC1: b"\xc3\x81", # Á
1265 0xC2: b"\xc3\x82", # Â
1266 0xC3: b"\xc3\x83", # Ã
1267 0xC4: b"\xc3\x84", # Ä
1268 0xC5: b"\xc3\x85", # Å
1269 0xC6: b"\xc3\x86", # Æ
1270 0xC7: b"\xc3\x87", # Ç
1271 0xC8: b"\xc3\x88", # È
1272 0xC9: b"\xc3\x89", # É
1273 0xCA: b"\xc3\x8a", # Ê
1274 0xCB: b"\xc3\x8b", # Ë
1275 0xCC: b"\xc3\x8c", # Ì
1276 0xCD: b"\xc3\x8d", # Í
1277 0xCE: b"\xc3\x8e", # Î
1278 0xCF: b"\xc3\x8f", # Ï
1279 0xD0: b"\xc3\x90", # Ð
1280 0xD1: b"\xc3\x91", # Ñ
1281 0xD2: b"\xc3\x92", # Ò
1282 0xD3: b"\xc3\x93", # Ó
1283 0xD4: b"\xc3\x94", # Ô
1284 0xD5: b"\xc3\x95", # Õ
1285 0xD6: b"\xc3\x96", # Ö
1286 0xD7: b"\xc3\x97", # ×
1287 0xD8: b"\xc3\x98", # Ø
1288 0xD9: b"\xc3\x99", # Ù
1289 0xDA: b"\xc3\x9a", # Ú
1290 0xDB: b"\xc3\x9b", # Û
1291 0xDC: b"\xc3\x9c", # Ü
1292 0xDD: b"\xc3\x9d", # Ý
1293 0xDE: b"\xc3\x9e", # Þ
1294 0xDF: b"\xc3\x9f", # ß
1295 0xE0: b"\xc3\xa0", # à
1296 0xE1: b"\xc3\xa1", # á
1297 0xE2: b"\xc3\xa2", # â
1298 0xE3: b"\xc3\xa3", # ã
1299 0xE4: b"\xc3\xa4", # ä
1300 0xE5: b"\xc3\xa5", # å
1301 0xE6: b"\xc3\xa6", # æ
1302 0xE7: b"\xc3\xa7", # ç
1303 0xE8: b"\xc3\xa8", # è
1304 0xE9: b"\xc3\xa9", # é
1305 0xEA: b"\xc3\xaa", # ê
1306 0xEB: b"\xc3\xab", # ë
1307 0xEC: b"\xc3\xac", # ì
1308 0xED: b"\xc3\xad", # í
1309 0xEE: b"\xc3\xae", # î
1310 0xEF: b"\xc3\xaf", # ï
1311 0xF0: b"\xc3\xb0", # ð
1312 0xF1: b"\xc3\xb1", # ñ
1313 0xF2: b"\xc3\xb2", # ò
1314 0xF3: b"\xc3\xb3", # ó
1315 0xF4: b"\xc3\xb4", # ô
1316 0xF5: b"\xc3\xb5", # õ
1317 0xF6: b"\xc3\xb6", # ö
1318 0xF7: b"\xc3\xb7", # ÷
1319 0xF8: b"\xc3\xb8", # ø
1320 0xF9: b"\xc3\xb9", # ù
1321 0xFA: b"\xc3\xba", # ú
1322 0xFB: b"\xc3\xbb", # û
1323 0xFC: b"\xc3\xbc", # ü
1324 0xFD: b"\xc3\xbd", # ý
1325 0xFE: b"\xc3\xbe", # þ
1326 0xFF: b"\xc3\xbf", # ÿ
1327 }
1329 #: :meta private:
1330 # Note that this isn't all Unicode noncharacters, just the noncontiguous ones that need to be listed.
1331 #
1332 # "A noncharacter is a code point that is in the range
1333 # U+FDD0 to U+FDEF, inclusive, or U+FFFE, U+FFFF, U+1FFFE,
1334 # U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE,
1335 # U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
1336 # U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE,
1337 # U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
1338 # U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE,
1339 # or U+10FFFF."
1340 ENUMERATED_NONCHARACTERS: Set[int] = set([0xfffe, 0xffff,
1341 0x1fffe, 0x1ffff,
1342 0x2fffe, 0x2ffff,
1343 0x3fffe, 0x3ffff,
1344 0x4fffe, 0x4ffff,
1345 0x5fffe, 0x5ffff,
1346 0x6fffe, 0x6ffff,
1347 0x7fffe, 0x7ffff,
1348 0x8fffe, 0x8ffff,
1349 0x9fffe, 0x9ffff,
1350 0xafffe, 0xaffff,
1351 0xbfffe, 0xbffff,
1352 0xcfffe, 0xcffff,
1353 0xdfffe, 0xdffff,
1354 0xefffe, 0xeffff,
1355 0xffffe, 0xfffff,
1356 0x10fffe, 0x10ffff])
1358 #: :meta private:
1359 MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
1360 (0xC2, 0xDF, 2), # 2-byte characters start with a byte C2-DF
1361 (0xE0, 0xEF, 3), # 3-byte characters start with E0-EF
1362 (0xF0, 0xF4, 4), # 4-byte characters start with F0-F4
1363 ]
1365 #: :meta private:
1366 FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]
1368 #: :meta private:
1369 LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
1371 @classmethod
1372 def numeric_character_reference(cls, numeric:int) -> Tuple[str, bool]:
1373 """This (mostly) implements the algorithm described in "Numeric character
1374 reference end state" from the HTML spec:
1375 https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
1377 The algorithm is designed to convert numeric character references like "☃"
1378 to Unicode characters like "☃".
1380 :return: A 2-tuple (character, replaced). `character` is the Unicode
1381 character corresponding to the numeric reference and `replaced` is
1382 whether or not an unresolvable character was replaced with REPLACEMENT
1383 CHARACTER.
1384 """
1385 replacement = "\ufffd"
1387 if numeric == 0x00:
1388 # "If the number is 0x00, then this is a
1389 # null-character-reference parse error. Set the character
1390 # reference code to 0xFFFD."
1391 return replacement, True
1393 if numeric > 0x10ffff:
1394 # "If the number is greater than 0x10FFFF, then this is a
1395 # character-reference-outside-unicode-range parse
1396 # error. Set the character reference code to 0xFFFD."
1397 return replacement, True
1399 if numeric >= 0xd800 and numeric <= 0xdfff:
1400 # "If the number is a surrogate, then this is a
1401 # surrogate-character-reference parse error. Set the
1402 # character reference code to 0xFFFD."
1403 return replacement, True
1405 if (numeric >= 0xfdd0 and numeric <= 0xfdef) or numeric in cls.ENUMERATED_NONCHARACTERS:
1406 # "If the number is a noncharacter, then this is a
1407 # noncharacter-character-reference parse error."
1408 #
1409 # "The parser resolves such character references as-is."
1410 #
1411 # I'm not sure what "as-is" means but I think it means that we act
1412 # like there was no error condition.
1413 return chr(numeric), False
1415 # "If the number is 0x0D, or a control that's not ASCII whitespace,
1416 # then this is a control-character-reference parse error."
1417 #
1418 # "A control is a C0 control or a code point in the range
1419 # U+007F DELETE to U+009F APPLICATION PROGRAM COMMAND,
1420 # inclusive."
1421 #
1422 # "A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive."
1423 #
1424 # "The parser resolves such character references as-is except C1 control references that are replaced."
1426 # First, let's replace the control references that can be replaced.
1427 if numeric >= 0x80 and numeric <= 0x9f and numeric in cls.WINDOWS_1252_TO_UTF8:
1428 # "If the number is one of the numbers in the first column of the
1429 # following table, then find the row with that number in the first
1430 # column, and set the character reference code to the number in the
1431 # second column of that row."
1432 #
1433 # This is an attempt to catch characters that were encoded to numeric
1434 # entities using their Windows-1252 encodings rather than their UTF-8
1435 # encodings.
1436 return cls.WINDOWS_1252_TO_UTF8[numeric].decode("utf8"), False
1438 # Now all that's left are references that should be resolved as-is. This
1439 # is also the default path for non-weird character references.
1440 try:
1441 return chr(numeric), False
1442 except (ValueError, OverflowError):
1443 # This shouldn't happen, since these cases should have been handled
1444 # above, but if it does, return REPLACEMENT CHARACTER
1445 return replacement, True
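# Hypothetical examples of the cases handled above (an editor's sketch):
#
#     >>> UnicodeDammit.numeric_character_reference(0x2603)   # &#x2603; -> snowman
#     ('☃', False)
#     >>> UnicodeDammit.numeric_character_reference(0x93)     # Windows-1252 left quote
#     ('“', False)
#     >>> UnicodeDammit.numeric_character_reference(0x110000) # outside the Unicode range
#     ('\ufffd', True)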
1447 @classmethod
1448 def detwingle(
1449 cls,
1450 in_bytes: bytes,
1451 main_encoding: _Encoding = "utf8",
1452 embedded_encoding: _Encoding = "windows-1252",
1453 ) -> bytes:
1454 """Fix characters from one encoding embedded in some other encoding.
1456 Currently the only situation supported is Windows-1252 (or its
1457 subset ISO-8859-1), embedded in UTF-8.
1459 :param in_bytes: A bytestring that you suspect contains
1460 characters from multiple encodings. Note that this *must*
1461 be a bytestring. If you've already converted the document
1462 to Unicode, you're too late.
1463 :param main_encoding: The primary encoding of ``in_bytes``.
1464 :param embedded_encoding: The encoding that was used to embed characters
1465 in the main document.
1466 :return: A bytestring similar to ``in_bytes``, in which
1467 ``embedded_encoding`` characters have been converted to
1468 their ``main_encoding`` equivalents.
1469 """
1470 if embedded_encoding.replace("_", "-").lower() not in (
1471 "windows-1252",
1472 "windows_1252",
1473 ):
1474 raise NotImplementedError(
1475 "Windows-1252 and ISO-8859-1 are the only currently supported "
1476 "embedded encodings."
1477 )
1479 if main_encoding.lower() not in ("utf8", "utf-8"):
1480 raise NotImplementedError(
1481 "UTF-8 is the only currently supported main encoding."
1482 )
1484 byte_chunks = []
1486 chunk_start = 0
1487 pos = 0
1488 while pos < len(in_bytes):
1489 byte = in_bytes[pos]
1490 if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
1491 # This is the start of a UTF-8 multibyte character. Skip
1492 # to the end.
1493 for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
1494 if byte >= start and byte <= end:
1495 pos += size
1496 break
1497 elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
1498 # We found a Windows-1252 character!
1499 # Save the string up to this point as a chunk.
1500 byte_chunks.append(in_bytes[chunk_start:pos])
1502 # Now translate the Windows-1252 character into UTF-8
1503 # and add it as another, one-byte chunk.
1504 byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
1505 pos += 1
1506 chunk_start = pos
1507 else:
1508 # Go on to the next character.
1509 pos += 1
1510 if chunk_start == 0:
1511 # The string is unchanged.
1512 return in_bytes
1513 else:
1514 # Store the final chunk.
1515 byte_chunks.append(in_bytes[chunk_start:])
1516 return b"".join(byte_chunks)
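# A minimal sketch of detwingle in action (hypothetical doctest): UTF-8 snowmen
# with Windows-1252 smart quotes embedded in the same bytestring:
#
#     >>> snowmen = "\N{SNOWMAN}" * 3
#     >>> quote = "\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}"
#     >>> doc = snowmen.encode("utf8") + quote.encode("windows_1252")
#     >>> UnicodeDammit.detwingle(doc).decode("utf8")
#     '☃☃☃“Hi!”'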