# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's `Universal
Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
by Kurt McKee. It does not rewrite the body of an XML or HTML document
to reflect a new encoding; that's the job of `TreeBuilder`.
"""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

from html.entities import codepoint2name
from collections import defaultdict
import codecs
from html.entities import html5
import re
from logging import Logger, getLogger
from types import ModuleType
from typing import (
    Dict,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)
from typing_extensions import Literal
from bs4._typing import (
    _Encoding,
    _Encodings,
)
import warnings
# Import a library to autodetect character encodings. We'll support
# any of a number of libraries that all support the same API:
#
# * cchardet
# * chardet
# * charset-normalizer
chardet_module: Optional[ModuleType] = None
try:
    # PyPI package: cchardet
    import cchardet

    chardet_module = cchardet
except ImportError:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet

        chardet_module = chardet
    except ImportError:
        try:
            # PyPI package: charset-normalizer
            import charset_normalizer

            chardet_module = charset_normalizer
        except ImportError:
            # No chardet available.
            pass


def _chardet_dammit(s: bytes) -> Optional[str]:
    """Try as hard as possible to detect the encoding of a bytestring."""
    if chardet_module is None or isinstance(s, str):
        return None
    module = chardet_module
    return module.detect(s)["encoding"]


# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
encoding_res[bytes] = {
    "html": re.compile(html_meta.encode("ascii"), re.I),
    "xml": re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    "html": re.compile(html_meta, re.I),
    "xml": re.compile(xml_encoding, re.I),
}
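
# Illustrative sketch of how these patterns are meant to be used: searching a
# bytestring with the compiled "html" pattern is expected to capture the
# declared charset name (expected output shown, for illustration only):
#
#   >>> m = encoding_res[bytes]["html"].search(b'<meta charset="utf-8">')
#   >>> m.group(1)
#   b'utf-8'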


class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: A map of named HTML entities to the corresponding Unicode string.
    #:
    #: :meta hide-value:
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: A map of Unicode strings to the corresponding named HTML entities;
    #: the inverse of HTML_ENTITY_TO_CHARACTER.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: A regular expression that matches any character (or, in rare
    #: cases, pair of characters) that can be replaced with a named
    #: HTML entity.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: A very similar regular expression to
    #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
    #: ampersands. This is used by the 'html' formatter to provide
    #: backwards-compatibility, even though the HTML5 spec allows most
    #: ampersands to go unescaped.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @classmethod
    def _populate_class_variables(cls) -> None:
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function sets the following class variables:

        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
         entity names like "angmsdaa". When a single Unicode string has
         multiple entity names, we try to choose the most commonly-used
         name.

        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
         Unicode strings like "⦨".

        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
         Unicode string that corresponds to an HTML5 named entity.

        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
         regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
         also matches unescaped ampersands. This is used by the 'html'
         formatter to provide backwards-compatibility, even though the HTML5
         spec allows most ampersands to go unescaped.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like "|" into named entities like
                # "&verbar;". The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like
                # '&fjlig;', though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            unicode_to_name[character] = name

        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )
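
    # Illustrative sketch of why the negative lookahead above matters: once
    # the class variables are populated, CHARACTER_TO_HTML_ENTITY_RE is
    # expected to match "\u2267\u0338" as a single two-character sequence,
    # but plain "\u2267" as a one-character sequence (expected behavior,
    # shown for illustration only):
    #
    #   >>> EntitySubstitution.CHARACTER_TO_HTML_ENTITY_RE.match("\u2267\u0338foo").group(0)
    #   '\u2267\u0338'
    #   >>> EntitySubstitution.CHARACTER_TO_HTML_ENTITY_RE.match("\u2267foo").group(0)
    #   '\u2267'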

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string."""
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            return "&amp;%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            return "&%s;" % possible_entity
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> Welcome to &quot;Bob's bar&quot;

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with
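
    # Illustrative examples of the quoting rules described above (expected
    # results, shown as a sketch rather than executable doctests):
    #
    #   >>> EntitySubstitution.quoted_attribute_value("Bob's Bar")
    #   '"Bob\'s Bar"'
    #   >>> EntitySubstitution.quoted_attribute_value('Welcome to "my bar"')
    #   '\'Welcome to "my bar"\''
    #   >>> EntitySubstitution.quoted_attribute_value('Welcome to "Bob\'s Bar"')
    #   '"Welcome to &quot;Bob\'s Bar&quot;"'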

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
         with named entities.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value
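
    # A sketch contrasting the two XML substitution methods above (expected
    # results, for illustration): the second one leaves ampersands that
    # already look like entities alone.
    #
    #   >>> EntitySubstitution.substitute_xml("AT&T is <b>big</b>")
    #   'AT&amp;T is &lt;b&gt;big&lt;/b&gt;'
    #   >>> EntitySubstitution.substitute_xml_containing_entities("&eacute; & <b>")
    #   '&eacute; &amp; &lt;b&gt;'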

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
          HTML entities.
        """
        # Convert any appropriate characters to HTML entities.
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
          HTML entities.
        """
        # First, escape any HTML entities found in the markup.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Next, convert any appropriate characters to unescaped HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
          HTML entities.
        """
        # First, escape the ampersand for anything that looks like an
        # entity but isn't in the list of recognized entities. All other
        # ampersands can be left alone.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then, convert a range of Unicode characters to unescaped
        # HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s
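
    # A sketch of the difference between substitute_html5 and
    # substitute_html5_raw (expected results, for illustration): the former
    # re-escapes every entity-like sequence, while the latter leaves
    # recognized entities alone and only escapes ambiguous ampersands.
    #
    #   >>> EntitySubstitution.substitute_html5("&copy; 2024 AT&T")
    #   '&amp;copy; 2024 AT&T'
    #   >>> EntitySubstitution.substitute_html5_raw("&copy; 2024 AT&T &fakeentity;")
    #   '&copy; 2024 AT&T &amp;fakeentity;'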


EntitySubstitution._populate_class_variables()


class EncodingDetector:
    """This class is capable of guessing a number of possible encodings
    for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
       (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
       byte-order mark sniffing fails (the ``user_encodings`` argument to the
       constructor).

    4. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.
    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = False if is_html is None else is_html
        self.declared_encoding: Optional[str] = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This
            will be modified as a side effect.
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings:
            return False
        if encoding not in tried:
            tried.add(encoding)
            return True
        return False

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings. Each is the name of an encoding
           that *might* work to convert a bytestring into Unicode.
        """
        tried: Set[_Encoding] = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self.sniffed_encoding is not None and self._usable(
            self.sniffed_encoding, tried
        ):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        if self.declared_encoding is not None and self._usable(
            self.declared_encoding, tried
        ):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        if self.chardet_encoding is not None and self._usable(
            self.chardet_encoding, tried
        ):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ("utf-8", "windows-1252"):
            if self._usable(e, tried):
                yield e
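
    # Illustrative sketch of the precedence order implemented above (expected
    # behavior, assuming no chardet-style library is installed):
    #
    #   >>> detector = EncodingDetector(
    #   ...     b'<?xml version="1.0" encoding="ISO-8859-1"?><doc/>',
    #   ...     known_definite_encodings=["utf-8"],
    #   ... )
    #   >>> list(detector.encodings)
    #   ['utf-8', 'iso-8859-1', 'windows-1252']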

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
            byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        if (
            (len(data) >= 4)
            and (data[:2] == b"\xfe\xff")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16be"
            data = data[2:]
        elif (
            (len(data) >= 4)
            and (data[:2] == b"\xff\xfe")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16le"
            data = data[2:]
        elif data[:3] == b"\xef\xbb\xbf":
            encoding = "utf-8"
            data = data[3:]
        elif data[:4] == b"\x00\x00\xfe\xff":
            encoding = "utf-32be"
            data = data[4:]
        elif data[:4] == b"\xff\xfe\x00\x00":
            encoding = "utf-32le"
            data = data[4:]
        return data, encoding
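
    # For illustration, a UTF-8 byte-order mark is expected to be stripped
    # and reported like this:
    #
    #   >>> EncodingDetector.strip_byte_order_mark(b"\xef\xbb\xbf<html></html>")
    #   (b'<html></html>', 'utf-8')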

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Given a document, tries to find an encoding declared within the
        text of the document itself.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to be declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data. Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, if one is found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res["xml"]
        html_re = res["html"]
        declared_encoding: Optional[_Encoding] = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode("ascii", "replace")
            return declared_encoding.lower()
        return None
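
    # For illustration, both declaration styles are expected to be recognized:
    #
    #   >>> EncodingDetector.find_declared_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
    #   'iso-8859-1'
    #   >>> EncodingDetector.find_declared_encoding(b'<meta charset="euc-jp">', is_html=True)
    #   'euc-jp'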


class UnicodeDammit:
    """A class for detecting the encoding of a bytestring containing an
    HTML or XML document, and decoding it to Unicode. If the source
    encoding is windows-1252, `UnicodeDammit` can also replace
    Microsoft smart quotes with their HTML or XML equivalents.

    :param markup: HTML or XML markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param smart_quotes_to: By default, Microsoft smart quotes will,
        like all other characters, be converted to Unicode
        characters. Setting this to ``ascii`` will convert them to ASCII
        quotes instead. Setting it to ``xml`` will convert them to XML
        entity references, and setting it to ``html`` will convert them
        to HTML entity references.

    :param is_html: If True, ``markup`` is treated as an HTML
        document. Otherwise it's treated as an XML document.

    :param exclude_encodings: These encodings will not be considered,
        even if the sniffing code thinks they might make sense.
    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = [],
        smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
        is_html: bool = False,
        exclude_encodings: Optional[_Encodings] = [],
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = getLogger(__name__)
        self.detector = EncodingDetector(
            markup,
            known_definite_encodings,
            is_html,
            exclude_encodings,
            user_encodings,
            override_encodings,
        )

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == b"":
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            markup = self.detector.markup
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.
            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."
                    )

                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        #
        # Note that this is extremely unlikely, probably impossible,
        # because the "replace" strategy is so powerful. Even running
        # the Python binary through Unicode, Dammit gives you Unicode,
        # albeit Unicode riddled with REPLACEMENT CHARACTER.
        if u is None:
            self.original_encoding = None
            self.unicode_markup = None
        else:
            self.unicode_markup = u

    #: The original markup, before it was converted to Unicode.
    #: This is not necessarily the same as what was passed in to the
    #: constructor, since any byte-order mark will be stripped.
    markup: bytes

    #: The Unicode version of the markup, following conversion. This
    #: is set to None if there was simply no way to convert the
    #: bytestring to Unicode (as with binary data).
    unicode_markup: Optional[str]

    #: This is True if `UnicodeDammit.unicode_markup` contains
    #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
    #: in `UnicodeDammit.markup`. These mark character sequences that
    #: could not be represented in Unicode.
    contains_replacement_characters: bool

    #: Unicode, Dammit's best guess as to the original character
    #: encoding of `UnicodeDammit.markup`.
    original_encoding: Optional[_Encoding]

    #: The strategy used to handle Microsoft smart quotes.
    smart_quotes_to: Optional[str]

    #: The (encoding, error handling strategy) 2-tuples that were used to
    #: try and convert the markup to Unicode.
    tried_encodings: List[Tuple[_Encoding, str]]

    log: Logger  #: :meta private:
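
    # Basic usage, for illustration (expected results):
    #
    #   >>> dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1"])
    #   >>> dammit.unicode_markup
    #   'Sacré bleu!'
    #   >>> dammit.original_encoding
    #   'latin-1'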

    def _sub_ms_char(self, match: re.Match) -> bytes:
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        TODO: Since this is only used to convert smart quotes, it
        could be simplified, and MS_CHARS_TO_ASCII made much less
        parochial.
        """
        orig: bytes = match.group(1)
        sub: bytes
        if self.smart_quotes_to == "ascii":
            if orig in self.MS_CHARS_TO_ASCII:
                sub = self.MS_CHARS_TO_ASCII[orig].encode()
            else:
                # Shouldn't happen; substitute the character
                # with itself.
                sub = orig
        else:
            if orig in self.MS_CHARS:
                substitutions = self.MS_CHARS[orig]
                if type(substitutions) is tuple:
                    if self.smart_quotes_to == "xml":
                        sub = b"&#x" + substitutions[1].encode() + b";"
                    else:
                        sub = b"&" + substitutions[0].encode() + b";"
                else:
                    substitutions = cast(str, substitutions)
                    sub = substitutions.encode()
            else:
                # Shouldn't happen; substitute the character
                # for itself.
                sub = orig
        return sub

    #: This dictionary maps commonly seen values for "charset" in HTML
    #: meta tags to the corresponding Python codec names. It only covers
    #: values that aren't in Python's aliases and can't be determined
    #: by the heuristics in `find_codec`.
    #:
    #: :meta hide-value:
    CHARSET_ALIASES: Dict[str, _Encoding] = {
        "macintosh": "mac-roman",
        "x-sjis": "shift-jis",
    }

    #: A list of encodings that tend to contain Microsoft smart quotes.
    #:
    #: :meta hide-value:
    ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def _convert_from(
        self, proposed: _Encoding, errors: str = "strict"
    ) -> Optional[str]:
        """Attempt to convert the markup to the proposed encoding.

        :param proposed: The name of a character encoding.
        :param errors: An error handling strategy, used when calling `str`.
        :return: The converted markup, or `None` if the proposed
           encoding/error handling strategy didn't work.
        """
        lookup_result = self.find_codec(proposed)
        if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
            return None
        proposed = lookup_result
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (
            self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES
        ):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            # print("Trying to convert document to %s (errors=%s)" % (
            #     proposed, errors))
            u = self._to_unicode(markup, proposed, errors)
            self.unicode_markup = u
            self.original_encoding = proposed
        except Exception:
            # print("That didn't work!")
            # print(e)
            return None
        # print("Correct encoding: %s" % proposed)
        return self.unicode_markup
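
    # The smart-quote handling above, for illustration (expected result when
    # smart_quotes_to="html" and the winning encoding is windows-1252):
    #
    #   >>> UnicodeDammit(b"\x91Hello\x92", ["windows-1252"],
    #   ...               smart_quotes_to="html").unicode_markup
    #   '&lsquo;Hello&rsquo;'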

    def _to_unicode(
        self, data: bytes, encoding: _Encoding, errors: str = "strict"
    ) -> str:
        """Given a bytestring and its encoding, decodes the string into Unicode.

        :param encoding: The name of an encoding.
        :param errors: An error handling strategy, used when calling `str`.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self) -> Optional[_Encoding]:
        """If the markup is an HTML document, returns the encoding, if any,
        declared *inside* the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset: _Encoding) -> Optional[str]:
        """Look up the Python codec corresponding to a given character set.

        :param charset: The name of a character set.
        :return: The name of a Python codec.
        """
        value = (
            self._codec(self.CHARSET_ALIASES.get(charset, charset))
            or (charset and self._codec(charset.replace("-", "")))
            or (charset and self._codec(charset.replace("-", "_")))
            or (charset and charset.lower())
            or charset
        )
        if value:
            return value.lower()
        return None

    def _codec(self, charset: _Encoding) -> Optional[str]:
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    #:
    #: :meta hide-value:
    MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
        b"\x80": ("euro", "20AC"),
        b"\x81": " ",
        b"\x82": ("sbquo", "201A"),
        b"\x83": ("fnof", "192"),
        b"\x84": ("bdquo", "201E"),
        b"\x85": ("hellip", "2026"),
        b"\x86": ("dagger", "2020"),
        b"\x87": ("Dagger", "2021"),
        b"\x88": ("circ", "2C6"),
        b"\x89": ("permil", "2030"),
        b"\x8a": ("Scaron", "160"),
        b"\x8b": ("lsaquo", "2039"),
        b"\x8c": ("OElig", "152"),
        b"\x8d": "?",
        b"\x8e": ("#x17D", "17D"),
        b"\x8f": "?",
        b"\x90": "?",
        b"\x91": ("lsquo", "2018"),
        b"\x92": ("rsquo", "2019"),
        b"\x93": ("ldquo", "201C"),
        b"\x94": ("rdquo", "201D"),
        b"\x95": ("bull", "2022"),
        b"\x96": ("ndash", "2013"),
        b"\x97": ("mdash", "2014"),
        b"\x98": ("tilde", "2DC"),
        b"\x99": ("trade", "2122"),
        b"\x9a": ("scaron", "161"),
        b"\x9b": ("rsaquo", "203A"),
        b"\x9c": ("oelig", "153"),
        b"\x9d": "?",
        b"\x9e": ("#x17E", "17E"),
        b"\x9f": ("Yuml", ""),
    }

    #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    #: horrors like stripping diacritical marks to turn á into a, but also
    #: contains non-horrors like turning “ into ".
    #:
    #: Seriously, don't use this for anything other than removing smart
    #: quotes.
    #:
    #: :meta private:
    MS_CHARS_TO_ASCII: Dict[bytes, str] = {
        b"\x80": "EUR",
        b"\x81": " ",
        b"\x82": ",",
        b"\x83": "f",
        b"\x84": ",,",
        b"\x85": "...",
        b"\x86": "+",
        b"\x87": "++",
        b"\x88": "^",
        b"\x89": "%",
        b"\x8a": "S",
        b"\x8b": "<",
        b"\x8c": "OE",
        b"\x8d": "?",
        b"\x8e": "Z",
        b"\x8f": "?",
        b"\x90": "?",
        b"\x91": "'",
        b"\x92": "'",
        b"\x93": '"',
        b"\x94": '"',
        b"\x95": "*",
        b"\x96": "-",
        b"\x97": "--",
        b"\x98": "~",
        b"\x99": "(TM)",
        b"\x9a": "s",
        b"\x9b": ">",
        b"\x9c": "oe",
        b"\x9d": "?",
        b"\x9e": "z",
        b"\x9f": "Y",
        b"\xa0": " ",
        b"\xa1": "!",
        b"\xa2": "c",
        b"\xa3": "GBP",
        b"\xa4": "$",  # This approximation is especially parochial--this is the
        # generic currency symbol.
        b"\xa5": "YEN",
        b"\xa6": "|",
        b"\xa7": "S",
        b"\xa8": "..",
        b"\xa9": "",
        b"\xaa": "(th)",
        b"\xab": "<<",
        b"\xac": "!",
        b"\xad": " ",
        b"\xae": "(R)",
        b"\xaf": "-",
        b"\xb0": "o",
        b"\xb1": "+-",
        b"\xb2": "2",
        b"\xb3": "3",
        b"\xb4": "'",
        b"\xb5": "u",
        b"\xb6": "P",
        b"\xb7": "*",
        b"\xb8": ",",
        b"\xb9": "1",
        b"\xba": "(th)",
        b"\xbb": ">>",
        b"\xbc": "1/4",
        b"\xbd": "1/2",
        b"\xbe": "3/4",
        b"\xbf": "?",
        b"\xc0": "A",
        b"\xc1": "A",
        b"\xc2": "A",
        b"\xc3": "A",
        b"\xc4": "A",
        b"\xc5": "A",
        b"\xc6": "AE",
        b"\xc7": "C",
        b"\xc8": "E",
        b"\xc9": "E",
        b"\xca": "E",
        b"\xcb": "E",
        b"\xcc": "I",
        b"\xcd": "I",
        b"\xce": "I",
        b"\xcf": "I",
        b"\xd0": "D",
        b"\xd1": "N",
        b"\xd2": "O",
        b"\xd3": "O",
        b"\xd4": "O",
        b"\xd5": "O",
        b"\xd6": "O",
        b"\xd7": "*",
        b"\xd8": "O",
        b"\xd9": "U",
        b"\xda": "U",
        b"\xdb": "U",
        b"\xdc": "U",
        b"\xdd": "Y",
        b"\xde": "b",
        b"\xdf": "B",
        b"\xe0": "a",
        b"\xe1": "a",
        b"\xe2": "a",
        b"\xe3": "a",
        b"\xe4": "a",
        b"\xe5": "a",
        b"\xe6": "ae",
        b"\xe7": "c",
        b"\xe8": "e",
        b"\xe9": "e",
        b"\xea": "e",
        b"\xeb": "e",
        b"\xec": "i",
        b"\xed": "i",
        b"\xee": "i",
        b"\xef": "i",
        b"\xf0": "o",
        b"\xf1": "n",
        b"\xf2": "o",
        b"\xf3": "o",
        b"\xf4": "o",
        b"\xf5": "o",
        b"\xf6": "o",
        b"\xf7": "/",
        b"\xf8": "o",
        b"\xf9": "u",
        b"\xfa": "u",
        b"\xfb": "u",
        b"\xfc": "u",
        b"\xfd": "y",
        b"\xfe": "b",
        b"\xff": "y",
    }

    #: A map used when removing rogue Windows-1252/ISO-8859-1
    #: characters in otherwise UTF-8 documents.
    #:
    #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
    #: Windows-1252.
    #:
    #: :meta hide-value:
    WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
        0x80: b"\xe2\x82\xac",  # €
        0x82: b"\xe2\x80\x9a",  # ‚
        0x83: b"\xc6\x92",  # ƒ
        0x84: b"\xe2\x80\x9e",  # „
        0x85: b"\xe2\x80\xa6",  # …
        0x86: b"\xe2\x80\xa0",  # †
        0x87: b"\xe2\x80\xa1",  # ‡
        0x88: b"\xcb\x86",  # ˆ
        0x89: b"\xe2\x80\xb0",  # ‰
        0x8A: b"\xc5\xa0",  # Š
        0x8B: b"\xe2\x80\xb9",  # ‹
        0x8C: b"\xc5\x92",  # Œ
        0x8E: b"\xc5\xbd",  # Ž
        0x91: b"\xe2\x80\x98",  # ‘
        0x92: b"\xe2\x80\x99",  # ’
        0x93: b"\xe2\x80\x9c",  # “
        0x94: b"\xe2\x80\x9d",  # ”
        0x95: b"\xe2\x80\xa2",  # •
        0x96: b"\xe2\x80\x93",  # –
        0x97: b"\xe2\x80\x94",  # —
        0x98: b"\xcb\x9c",  # ˜
        0x99: b"\xe2\x84\xa2",  # ™
        0x9A: b"\xc5\xa1",  # š
        0x9B: b"\xe2\x80\xba",  # ›
        0x9C: b"\xc5\x93",  # œ
        0x9E: b"\xc5\xbe",  # ž
        0x9F: b"\xc5\xb8",  # Ÿ
        0xA0: b"\xc2\xa0",  #
        0xA1: b"\xc2\xa1",  # ¡
        0xA2: b"\xc2\xa2",  # ¢
        0xA3: b"\xc2\xa3",  # £
        0xA4: b"\xc2\xa4",  # ¤
        0xA5: b"\xc2\xa5",  # ¥
        0xA6: b"\xc2\xa6",  # ¦
        0xA7: b"\xc2\xa7",  # §
        0xA8: b"\xc2\xa8",  # ¨
        0xA9: b"\xc2\xa9",  # ©
        0xAA: b"\xc2\xaa",  # ª
        0xAB: b"\xc2\xab",  # «
        0xAC: b"\xc2\xac",  # ¬
        0xAD: b"\xc2\xad",  #
        0xAE: b"\xc2\xae",  # ®
        0xAF: b"\xc2\xaf",  # ¯
        0xB0: b"\xc2\xb0",  # °
        0xB1: b"\xc2\xb1",  # ±
        0xB2: b"\xc2\xb2",  # ²
        0xB3: b"\xc2\xb3",  # ³
        0xB4: b"\xc2\xb4",  # ´
        0xB5: b"\xc2\xb5",  # µ
        0xB6: b"\xc2\xb6",  # ¶
        0xB7: b"\xc2\xb7",  # ·
        0xB8: b"\xc2\xb8",  # ¸
        0xB9: b"\xc2\xb9",  # ¹
        0xBA: b"\xc2\xba",  # º
        0xBB: b"\xc2\xbb",  # »
        0xBC: b"\xc2\xbc",  # ¼
        0xBD: b"\xc2\xbd",  # ½
        0xBE: b"\xc2\xbe",  # ¾
        0xBF: b"\xc2\xbf",  # ¿
        0xC0: b"\xc3\x80",  # À
        0xC1: b"\xc3\x81",  # Á
        0xC2: b"\xc3\x82",  # Â
        0xC3: b"\xc3\x83",  # Ã
        0xC4: b"\xc3\x84",  # Ä
        0xC5: b"\xc3\x85",  # Å
        0xC6: b"\xc3\x86",  # Æ
        0xC7: b"\xc3\x87",  # Ç
        0xC8: b"\xc3\x88",  # È
        0xC9: b"\xc3\x89",  # É
        0xCA: b"\xc3\x8a",  # Ê
        0xCB: b"\xc3\x8b",  # Ë
        0xCC: b"\xc3\x8c",  # Ì
        0xCD: b"\xc3\x8d",  # Í
        0xCE: b"\xc3\x8e",  # Î
        0xCF: b"\xc3\x8f",  # Ï
        0xD0: b"\xc3\x90",  # Ð
        0xD1: b"\xc3\x91",  # Ñ
        0xD2: b"\xc3\x92",  # Ò
        0xD3: b"\xc3\x93",  # Ó
        0xD4: b"\xc3\x94",  # Ô
        0xD5: b"\xc3\x95",  # Õ
        0xD6: b"\xc3\x96",  # Ö
        0xD7: b"\xc3\x97",  # ×
        0xD8: b"\xc3\x98",  # Ø
        0xD9: b"\xc3\x99",  # Ù
        0xDA: b"\xc3\x9a",  # Ú
        0xDB: b"\xc3\x9b",  # Û
        0xDC: b"\xc3\x9c",  # Ü
        0xDD: b"\xc3\x9d",  # Ý
        0xDE: b"\xc3\x9e",  # Þ
        0xDF: b"\xc3\x9f",  # ß
        0xE0: b"\xc3\xa0",  # à
        0xE1: b"\xc3\xa1",  # á
        0xE2: b"\xc3\xa2",  # â
        0xE3: b"\xc3\xa3",  # ã
        0xE4: b"\xc3\xa4",  # ä
        0xE5: b"\xc3\xa5",  # å
        0xE6: b"\xc3\xa6",  # æ
        0xE7: b"\xc3\xa7",  # ç
        0xE8: b"\xc3\xa8",  # è
        0xE9: b"\xc3\xa9",  # é
        0xEA: b"\xc3\xaa",  # ê
        0xEB: b"\xc3\xab",  # ë
        0xEC: b"\xc3\xac",  # ì
        0xED: b"\xc3\xad",  # í
        0xEE: b"\xc3\xae",  # î
        0xEF: b"\xc3\xaf",  # ï
        0xF0: b"\xc3\xb0",  # ð
        0xF1: b"\xc3\xb1",  # ñ
        0xF2: b"\xc3\xb2",  # ò
        0xF3: b"\xc3\xb3",  # ó
        0xF4: b"\xc3\xb4",  # ô
        0xF5: b"\xc3\xb5",  # õ
        0xF6: b"\xc3\xb6",  # ö
        0xF7: b"\xc3\xb7",  # ÷
        0xF8: b"\xc3\xb8",  # ø
        0xF9: b"\xc3\xb9",  # ù
        0xFA: b"\xc3\xba",  # ú
        0xFB: b"\xc3\xbb",  # û
        0xFC: b"\xc3\xbc",  # ü
        0xFD: b"\xc3\xbd",  # ý
        0xFE: b"\xc3\xbe",  # þ
    }

    #: :meta private:
    MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
        (0xC2, 0xDF, 2),  # 2-byte characters start with a byte C2-DF
        (0xE0, 0xEF, 3),  # 3-byte characters start with E0-EF
        (0xF0, 0xF4, 4),  # 4-byte characters start with F0-F4
    ]

    #: :meta private:
    FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

    #: :meta private:
    LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(
        cls,
        in_bytes: bytes,
        main_encoding: _Encoding = "utf8",
        embedded_encoding: _Encoding = "windows-1252",
    ) -> bytes:
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this *must*
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of ``in_bytes``.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring similar to ``in_bytes``, in which
            ``embedded_encoding`` characters have been converted to
            their ``main_encoding`` equivalents.
        """
        if embedded_encoding.replace("_", "-").lower() not in (
            "windows-1252",
            "windows_1252",
        ):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings."
            )

        if main_encoding.lower() not in ("utf8", "utf-8"):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding."
            )

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b"".join(byte_chunks)
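
    # detwingle in action, for illustration (expected result): Windows-1252
    # smart quotes embedded in otherwise UTF-8 bytes are converted to their
    # UTF-8 equivalents, so the whole document can then be decoded as UTF-8.
    #
    #   >>> snowmen = "\N{SNOWMAN}" * 3
    #   >>> quote = "\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}"
    #   >>> doc = snowmen.encode("utf8") + quote.encode("windows_1252")
    #   >>> UnicodeDammit.detwingle(doc).decode("utf8")
    #   '☃☃☃“I like snowmen!”'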