Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/dammit.py: 41%
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's `Universal
6Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
7by Kurt McKee. It does not rewrite the body of an XML or HTML document
8to reflect a new encoding; that's the job of `TreeBuilder`.
10"""
12# Use of this source code is governed by the MIT license.
13__license__ = "MIT"
15from html.entities import codepoint2name
16from collections import defaultdict
17import codecs
18from html.entities import html5
19import re
20from logging import Logger, getLogger
21from types import ModuleType
22from typing import (
23 Dict,
24 Iterator,
25 List,
26 Optional,
27 Pattern,
28 Set,
29 Tuple,
30 Type,
31 Union,
32 cast,
33)
34from typing_extensions import Literal
35from bs4._typing import (
36 _Encoding,
37 _Encodings,
38)
39import warnings
41# Import a library to autodetect character encodings. We'll support
42# any of a number of libraries that all support the same API:
43#
44# * cchardet
45# * chardet
46# * charset-normalizer
47chardet_module: Optional[ModuleType] = None
48try:
49 # PyPI package: cchardet
50 import cchardet # type:ignore
52 chardet_module = cchardet
53except ImportError:
54 try:
55 # Debian package: python-chardet
56 # PyPI package: chardet
57 import chardet
59 chardet_module = chardet
60 except ImportError:
61 try:
62 # PyPI package: charset-normalizer
63 import charset_normalizer # type:ignore
65 chardet_module = charset_normalizer
66 except ImportError:
67 # No chardet available.
68 pass
71def _chardet_dammit(s: bytes) -> Optional[str]:
72 """Try as hard as possible to detect the encoding of a bytestring."""
73 if chardet_module is None or isinstance(s, str):
74 return None
75 module = chardet_module
76 return module.detect(s)["encoding"]
79# Build bytestring and Unicode versions of regular expressions for finding
80# a declared encoding inside an XML or HTML document.
81xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>" #: :meta private:
82html_meta: str = (
83 "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]" #: :meta private:
84)
86# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
87encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
88encoding_res[bytes] = {
89 "html": re.compile(html_meta.encode("ascii"), re.I),
90 "xml": re.compile(xml_encoding.encode("ascii"), re.I),
91}
92encoding_res[str] = {
93 "html": re.compile(html_meta, re.I),
94 "xml": re.compile(xml_encoding, re.I),
95}
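# A brief illustration of how these patterns are meant to match (an editor's
# sketch, not part of the library's test suite; the inputs are hypothetical
# but the captured groups follow from the regexes above):
#
#     >>> encoding_res[bytes]["html"].search(b'<meta charset="utf-8">').group(1)
#     b'utf-8'
#     >>> encoding_res[bytes]["xml"].search(b'<?xml version="1.0" encoding="ISO-8859-1"?>').group(1)
#     b'ISO-8859-1'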
98class EntitySubstitution(object):
99 """The ability to substitute XML or HTML entities for certain characters."""
101 #: A map of named HTML entities to the corresponding Unicode string.
102 #:
103 #: :meta hide-value:
104 HTML_ENTITY_TO_CHARACTER: Dict[str, str]
106 #: A map of Unicode strings to the corresponding named HTML entities;
107 #: the inverse of HTML_ENTITY_TO_CHARACTER.
108 #:
109 #: :meta hide-value:
110 CHARACTER_TO_HTML_ENTITY: Dict[str, str]
112 #: A regular expression that matches any character (or, in rare
113 #: cases, pair of characters) that can be replaced with a named
114 #: HTML entity.
115 #:
116 #: :meta hide-value:
117 CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]
119 #: A very similar regular expression to
120 #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
121 #: ampersands. This is used by the 'html' formatter to provide
122 #: backwards-compatibility, even though the HTML5 spec allows most
123 #: ampersands to go unescaped.
124 #:
125 #: :meta hide-value:
126 CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]
128 @classmethod
129 def _populate_class_variables(cls) -> None:
130 """Initialize variables used by this class to manage the plethora of
131 HTML5 named entities.
133 This function sets the following class variables:
135 CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
136 entity names like "angmsdaa". When a single Unicode string has
137 multiple entity names, we try to choose the most commonly-used
138 name.
140 HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
141 Unicode strings like "⦨".
143 CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
144 Unicode string that corresponds to an HTML5 named entity.
146 CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
147 regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
148 also matches unescaped ampersands. This is used by the 'html'
149 formatter to provide backwards-compatibility, even though the HTML5
150 spec allows most ampersands to go unescaped.
151 """
152 unicode_to_name = {}
153 name_to_unicode = {}
155 short_entities = set()
156 long_entities_by_first_character = defaultdict(set)
158 for name_with_semicolon, character in sorted(html5.items()):
159 # "It is intentional, for legacy compatibility, that many
160 # code points have multiple character reference names. For
161 # example, some appear both with and without the trailing
162 # semicolon, or with different capitalizations."
163 # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
164 #
165 # The parsers are in charge of handling (or not) character
166 # references with no trailing semicolon, so we remove the
167 # semicolon whenever it appears.
168 if name_with_semicolon.endswith(";"):
169 name = name_with_semicolon[:-1]
170 else:
171 name = name_with_semicolon
173 # When parsing HTML, we want to recognize any known named
174 # entity and convert it to a sequence of Unicode
175 # characters.
176 if name not in name_to_unicode:
177 name_to_unicode[name] = character
179 # When _generating_ HTML, we want to recognize special
180 # character sequences that _could_ be converted to named
181 # entities.
182 unicode_to_name[character] = name
184 # We also need to build a regular expression that lets us
185 # _find_ those characters in output strings so we can
186 # replace them.
187 #
188 # This is tricky, for two reasons.
190 if len(character) == 1 and ord(character) < 128 and character not in "<>":
191 # First, it would be annoying to turn single ASCII
192 # characters like "|" into named entities like
193 # "&verbar;". The exceptions are <>, which we _must_
194 # turn into named entities to produce valid HTML.
195 continue
197 if len(character) > 1 and all(ord(x) < 128 for x in character):
198 # We also do not want to turn _combinations_ of ASCII
199 # characters like 'fj' into named entities like '&fjlig;',
200 # though that's more debatable.
201 continue
203 # Second, some named entities have a Unicode value that's
204 # a subset of the Unicode value for some _other_ named
205 # entity. As an example, '\u2267' is &GreaterFullEqual;,
206 # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
207 # expression needs to match the first two characters of
208 # "\u2267\u0338foo", but only the first character of
209 # "\u2267foo".
210 #
211 # In this step, we build two sets of characters that
212 # _eventually_ need to go into the regular expression. But
213 # we won't know exactly what the regular expression needs
214 # to look like until we've gone through the entire list of
215 # named entities.
216 if len(character) == 1 and character != "&":
217 short_entities.add(character)
218 else:
219 long_entities_by_first_character[character[0]].add(character)
221 # Now that we've been through the entire list of entities, we
222 # can create a regular expression that matches any of them.
223 particles = set()
224 for short in short_entities:
225 long_versions = long_entities_by_first_character[short]
226 if not long_versions:
227 particles.add(short)
228 else:
229 ignore = "".join([x[1] for x in long_versions])
230 # This finds, e.g. \u2267 but only if it is _not_
231 # followed by \u0338.
232 particles.add("%s(?![%s])" % (short, ignore))
234 for long_entities in list(long_entities_by_first_character.values()):
235 for long_entity in long_entities:
236 particles.add(long_entity)
238 re_definition = "(%s)" % "|".join(particles)
240 particles.add("&")
241 re_definition_with_ampersand = "(%s)" % "|".join(particles)
243 # If an entity shows up in both html5 and codepoint2name, it's
244 # likely that HTML5 gives it several different names, such as
245 # 'rsquo' and 'rsquor'. When converting Unicode characters to
246 # named entities, the codepoint2name name should take
247 # precedence where possible, since that's the more easily
248 # recognizable one.
249 for codepoint, name in list(codepoint2name.items()):
250 character = chr(codepoint)
251 unicode_to_name[character] = name
253 cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
254 cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
255 cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
256 cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
257 re_definition_with_ampersand
258 )
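# An illustrative sketch of the lookahead trick described above (a hypothetical
# doctest, not from the library): once the class variables are populated, the
# regular expression should consume both characters of the longer sequence but
# only one character of the shorter one:
#
#     >>> re_ = EntitySubstitution.CHARACTER_TO_HTML_ENTITY_RE
#     >>> re_.match("\u2267\u0338rest").group(0) == "\u2267\u0338"
#     True
#     >>> re_.match("\u2267rest").group(0) == "\u2267"
#     True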
260 #: A map of Unicode strings to the corresponding named XML entities.
261 #:
262 #: :meta hide-value:
263 CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
264 "'": "apos",
265 '"': "quot",
266 "&": "amp",
267 "<": "lt",
268 ">": "gt",
269 }
271 # Matches any named or numeric HTML entity.
272 ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)
274 #: A regular expression matching an angle bracket or an ampersand that
275 #: is not part of an XML or HTML entity.
276 #:
277 #: :meta hide-value:
278 BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
279 "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
280 )
282 #: A regular expression matching an angle bracket or an ampersand.
283 #:
284 #: :meta hide-value:
285 AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")
287 @classmethod
288 def _substitute_html_entity(cls, matchobj: re.Match) -> str:
289 """Used with a regular expression to substitute the
290 appropriate HTML entity for a special character string."""
291 original_entity = matchobj.group(0)
292 entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
293 if entity is None:
294 return "&%s;" % original_entity
295 return "&%s;" % entity
297 @classmethod
298 def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
299 """Used with a regular expression to substitute the
300 appropriate XML entity for a special character string."""
301 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
302 return "&%s;" % entity
304 @classmethod
305 def _escape_entity_name(cls, matchobj: re.Match) -> str:
306 return "&amp;%s;" % matchobj.group(1)
308 @classmethod
309 def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
310 possible_entity = matchobj.group(1)
311 if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
312 return "&%s;" % possible_entity
313 return "&amp;%s;" % possible_entity
315 @classmethod
316 def quoted_attribute_value(cls, value: str) -> str:
317 """Make a value into a quoted XML attribute, possibly escaping it.
319 Most strings will be quoted using double quotes.
321 Bob's Bar -> "Bob's Bar"
323 If a string contains double quotes, it will be quoted using
324 single quotes.
326 Welcome to "my bar" -> 'Welcome to "my bar"'
328 If a string contains both single and double quotes, the
329 double quotes will be escaped, and the string will be quoted
330 using double quotes.
332 Welcome to "Bob's Bar" -> Welcome to &quot;Bob's bar&quot;
334 :param value: The XML attribute value to quote
335 :return: The quoted value
336 """
337 quote_with = '"'
338 if '"' in value:
339 if "'" in value:
340 # The string contains both single and double
341 # quotes. Turn the double quotes into
342 # entities. We quote the double quotes rather than
343 # the single quotes because the entity name is
344 # "&quot;" whether this is HTML or XML. If we
345 # quoted the single quotes, we'd have to decide
346 # between &apos; and &squot;.
347 replace_with = "&quot;"
348 value = value.replace('"', replace_with)
349 else:
350 # There are double quotes but no single quotes.
351 # We can use single quotes to quote the attribute.
352 quote_with = "'"
353 return quote_with + value + quote_with
355 @classmethod
356 def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
357 """Replace special XML characters with named XML entities.
359 The less-than sign will become &lt;, the greater-than sign
360 will become &gt;, and any ampersands will become &amp;. If you
361 want ampersands that seem to be part of an entity definition
362 to be left alone, use `substitute_xml_containing_entities`
363 instead.
365 :param value: A string to be substituted.
367 :param make_quoted_attribute: If True, then the string will be
368 quoted, as befits an attribute value.
370 :return: A version of ``value`` with special characters replaced
371 with named entities.
372 """
373 # Escape angle brackets and ampersands.
374 value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
376 if make_quoted_attribute:
377 value = cls.quoted_attribute_value(value)
378 return value
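# A hypothetical usage sketch (editor's illustration, not from the library's
# documentation); given the substitutions above, one would expect roughly:
#
#     >>> EntitySubstitution.substitute_xml('AT&T <rocks>', make_quoted_attribute=True)
#     '"AT&amp;T &lt;rocks&gt;"'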
380 @classmethod
381 def substitute_xml_containing_entities(
382 cls, value: str, make_quoted_attribute: bool = False
383 ) -> str:
384 """Substitute XML entities for special XML characters.
386 :param value: A string to be substituted. The less-than sign will
387 become &lt;, the greater-than sign will become &gt;, and any
388 ampersands that are not part of an entity definition will
389 become &amp;.
391 :param make_quoted_attribute: If True, then the string will be
392 quoted, as befits an attribute value.
393 """
394 # Escape angle brackets, and ampersands that aren't part of
395 # entities.
396 value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)
398 if make_quoted_attribute:
399 value = cls.quoted_attribute_value(value)
400 return value
402 @classmethod
403 def substitute_html(cls, s: str) -> str:
404 """Replace certain Unicode characters with named HTML entities.
406 This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
407 in that the goal is to make the result more readable (to those
408 with ASCII displays) rather than to recover from
409 errors. There's absolutely nothing wrong with a UTF-8 string
410 containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
411 character with "&eacute;" will make it more readable to some
412 people.
414 :param s: The string to be modified.
415 :return: The string with some Unicode characters replaced with
416 HTML entities.
417 """
418 # Convert any appropriate characters to HTML entities.
419 return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
420 cls._substitute_html_entity, s
421 )
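# A hypothetical example of the behaviour described above (editor's sketch):
#
#     >>> EntitySubstitution.substitute_html("café & résumé")
#     'caf&eacute; &amp; r&eacute;sum&eacute;'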
423 @classmethod
424 def substitute_html5(cls, s: str) -> str:
425 """Replace certain Unicode characters with named HTML entities
426 using HTML5 rules.
428 Specifically, this method is much less aggressive about
429 escaping ampersands than substitute_html. Only ambiguous
430 ampersands are escaped, per the HTML5 standard:
432 "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
433 that is followed by one or more ASCII alphanumerics, followed
434 by a U+003B SEMICOLON character (;), where these characters do
435 not match any of the names given in the named character
436 references section."
438 Unlike substitute_html5_raw, this method assumes HTML entities
439 were converted to Unicode characters on the way in, as
440 Beautiful Soup does. By the time Beautiful Soup does its work,
441 the only ambiguous ampersands that need to be escaped are the
442 ones that were escaped in the original markup when mentioning
443 HTML entities.
445 :param s: The string to be modified.
446 :return: The string with some Unicode characters replaced with
447 HTML entities.
448 """
449 # First, escape any HTML entities found in the markup.
450 s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)
452 # Next, convert any appropriate characters to unescaped HTML entities.
453 s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
455 return s
457 @classmethod
458 def substitute_html5_raw(cls, s: str) -> str:
459 """Replace certain Unicode characters with named HTML entities
460 using HTML5 rules.
462 substitute_html5_raw is similar to substitute_html5 but it is
463 designed for standalone use (whereas substitute_html5 is
464 designed for use with Beautiful Soup).
466 :param s: The string to be modified.
467 :return: The string with some Unicode characters replaced with
468 HTML entities.
469 """
470 # First, escape the ampersand for anything that looks like an
471 # entity but isn't in the list of recognized entities. All other
472 # ampersands can be left alone.
473 s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)
475 # Then, convert a range of Unicode characters to unescaped
476 # HTML entities.
477 s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)
479 return s
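# An illustrative, hypothetical doctest contrasting this with substitute_html:
# a bare ampersand that is not part of an entity-like sequence should be left
# alone, while characters with named entities are still converted:
#
#     >>> EntitySubstitution.substitute_html5_raw("café & croissants")
#     'caf&eacute; & croissants'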
482EntitySubstitution._populate_class_variables()
485class EncodingDetector:
486 """This class is capable of guessing a number of possible encodings
487 for a bytestring.
489 Order of precedence:
491 1. Encodings you specifically tell EncodingDetector to try first
492 (the ``known_definite_encodings`` argument to the constructor).
494 2. An encoding determined by sniffing the document's byte-order mark.
496 3. Encodings you specifically tell EncodingDetector to try if
497 byte-order mark sniffing fails (the ``user_encodings`` argument to the
498 constructor).
500 4. An encoding declared within the bytestring itself, either in an
501 XML declaration (if the bytestring is to be interpreted as an XML
502 document), or in a <meta> tag (if the bytestring is to be
503 interpreted as an HTML document.)
505 5. An encoding detected through textual analysis by chardet,
506 cchardet, or a similar external library.
508 6. UTF-8.
510 7. Windows-1252.
512 :param markup: Some markup in an unknown encoding.
514 :param known_definite_encodings: When determining the encoding
515 of ``markup``, these encodings will be tried first, in
516 order. In HTML terms, this corresponds to the "known
517 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
519 :param user_encodings: These encodings will be tried after the
520 ``known_definite_encodings`` have been tried and failed, and
521 after an attempt to sniff the encoding by looking at a
522 byte order mark has failed. In HTML terms, this
523 corresponds to the step "user has explicitly instructed
524 the user agent to override the document's character
525 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
527 :param override_encodings: A **deprecated** alias for
528 ``known_definite_encodings``. Any encodings here will be tried
529 immediately after the encodings in
530 ``known_definite_encodings``.
532 :param is_html: If True, this markup is considered to be
533 HTML. Otherwise it's assumed to be XML.
535 :param exclude_encodings: These encodings will not be tried,
536 even if they otherwise would be.
538 """
540 def __init__(
541 self,
542 markup: bytes,
543 known_definite_encodings: Optional[_Encodings] = None,
544 is_html: Optional[bool] = False,
545 exclude_encodings: Optional[_Encodings] = None,
546 user_encodings: Optional[_Encodings] = None,
547 override_encodings: Optional[_Encodings] = None,
548 ):
549 self.known_definite_encodings = list(known_definite_encodings or [])
550 if override_encodings:
551 warnings.warn(
552 "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
553 DeprecationWarning,
554 stacklevel=3,
555 )
556 self.known_definite_encodings += override_encodings
557 self.user_encodings = user_encodings or []
558 exclude_encodings = exclude_encodings or []
559 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
560 self.chardet_encoding = None
561 self.is_html = False if is_html is None else is_html
562 self.declared_encoding: Optional[str] = None
564 # First order of business: strip a byte-order mark.
565 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
567 known_definite_encodings: _Encodings
568 user_encodings: _Encodings
569 exclude_encodings: _Encodings
570 chardet_encoding: Optional[_Encoding]
571 is_html: bool
572 declared_encoding: Optional[_Encoding]
573 markup: bytes
574 sniffed_encoding: Optional[_Encoding]
576 def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
577 """Should we even bother to try this encoding?
579 :param encoding: Name of an encoding.
580 :param tried: Encodings that have already been tried. This
581 will be modified as a side effect.
582 """
583 if encoding is None:
584 return False
585 encoding = encoding.lower()
586 if encoding in self.exclude_encodings:
587 return False
588 if encoding not in tried:
589 tried.add(encoding)
590 return True
591 return False
593 @property
594 def encodings(self) -> Iterator[_Encoding]:
595 """Yield a number of encodings that might work for this markup.
597 :yield: A sequence of strings. Each is the name of an encoding
598 that *might* work to convert a bytestring into Unicode.
599 """
600 tried: Set[_Encoding] = set()
602 # First, try the known definite encodings
603 for e in self.known_definite_encodings:
604 if self._usable(e, tried):
605 yield e
607 # Did the document originally start with a byte-order mark
608 # that indicated its encoding?
609 if self.sniffed_encoding is not None and self._usable(
610 self.sniffed_encoding, tried
611 ):
612 yield self.sniffed_encoding
614 # Sniffing the byte-order mark did nothing; try the user
615 # encodings.
616 for e in self.user_encodings:
617 if self._usable(e, tried):
618 yield e
620 # Look within the document for an XML or HTML encoding
621 # declaration.
622 if self.declared_encoding is None:
623 self.declared_encoding = self.find_declared_encoding(
624 self.markup, self.is_html
625 )
626 if self.declared_encoding is not None and self._usable(
627 self.declared_encoding, tried
628 ):
629 yield self.declared_encoding
631 # Use third-party character set detection to guess at the
632 # encoding.
633 if self.chardet_encoding is None:
634 self.chardet_encoding = _chardet_dammit(self.markup)
635 if self.chardet_encoding is not None and self._usable(
636 self.chardet_encoding, tried
637 ):
638 yield self.chardet_encoding
640 # As a last-ditch effort, try utf-8 and windows-1252.
641 for e in ("utf-8", "windows-1252"):
642 if self._usable(e, tried):
643 yield e
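# An illustrative (hypothetical) walk through the precedence order above; later
# candidates depend on which chardet-style library is installed, so only the
# start of the sequence is predictable:
#
#     >>> detector = EncodingDetector(b'<?xml version="1.0" encoding="utf-8"?><a/>')
#     >>> next(detector.encodings)
#     'utf-8'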
645 @classmethod
646 def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
647 """If a byte-order mark is present, strip it and return the encoding it implies.
649 :param data: A bytestring that may or may not begin with a
650 byte-order mark.
652 :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
653 """
654 encoding = None
655 if isinstance(data, str):
656 # Unicode data cannot have a byte-order mark.
657 return data, encoding
658 if (
659 (len(data) >= 4)
660 and (data[:2] == b"\xfe\xff")
661 and (data[2:4] != b"\x00\x00")
662 ):
663 encoding = "utf-16be"
664 data = data[2:]
665 elif (
666 (len(data) >= 4)
667 and (data[:2] == b"\xff\xfe")
668 and (data[2:4] != b"\x00\x00")
669 ):
670 encoding = "utf-16le"
671 data = data[2:]
672 elif data[:3] == b"\xef\xbb\xbf":
673 encoding = "utf-8"
674 data = data[3:]
675 elif data[:4] == b"\x00\x00\xfe\xff":
676 encoding = "utf-32be"
677 data = data[4:]
678 elif data[:4] == b"\xff\xfe\x00\x00":
679 encoding = "utf-32le"
680 data = data[4:]
681 return data, encoding
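# Hypothetical examples of BOM sniffing (an editor's sketch based on the
# branches above):
#
#     >>> EncodingDetector.strip_byte_order_mark(b"\xef\xbb\xbfhello")
#     (b'hello', 'utf-8')
#     >>> EncodingDetector.strip_byte_order_mark(b"\xff\xfeh\x00i\x00")
#     (b'h\x00i\x00', 'utf-16le')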
683 @classmethod
684 def find_declared_encoding(
685 cls,
686 markup: Union[bytes, str],
687 is_html: bool = False,
688 search_entire_document: bool = False,
689 ) -> Optional[_Encoding]:
690 """Given a document, tries to find an encoding declared within the
691 text of the document itself.
693 An XML encoding is declared at the beginning of the document.
695 An HTML encoding is declared in a <meta> tag, hopefully near the
696 beginning of the document.
698 :param markup: Some markup.
699 :param is_html: If True, this markup is considered to be HTML. Otherwise
700 it's assumed to be XML.
701 :param search_entire_document: Since an encoding is supposed
702 to be declared near the beginning of the document, most of
703 the time it's only necessary to search a few kilobytes of
704 data. Set this to True to force this method to search the
705 entire document.
706 :return: The declared encoding, if one is found.
707 """
708 if search_entire_document:
709 xml_endpos = html_endpos = len(markup)
710 else:
711 xml_endpos = 1024
712 html_endpos = max(2048, int(len(markup) * 0.05))
714 if isinstance(markup, bytes):
715 res = encoding_res[bytes]
716 else:
717 res = encoding_res[str]
719 xml_re = res["xml"]
720 html_re = res["html"]
721 declared_encoding: Optional[_Encoding] = None
722 declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
723 if not declared_encoding_match and is_html:
724 declared_encoding_match = html_re.search(markup, endpos=html_endpos)
725 if declared_encoding_match is not None:
726 declared_encoding = declared_encoding_match.groups()[0]
727 if declared_encoding:
728 if isinstance(declared_encoding, bytes):
729 declared_encoding = declared_encoding.decode("ascii", "replace")
730 return declared_encoding.lower()
731 return None
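# Hypothetical examples (an editor's sketch) of declared-encoding detection:
#
#     >>> EncodingDetector.find_declared_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
#     'iso-8859-1'
#     >>> EncodingDetector.find_declared_encoding(b'<meta charset="utf-8">', is_html=True)
#     'utf-8'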
734class UnicodeDammit:
735 """A class for detecting the encoding of a bytestring containing an
736 HTML or XML document, and decoding it to Unicode. If the source
737 encoding is windows-1252, `UnicodeDammit` can also replace
738 Microsoft smart quotes with their HTML or XML equivalents.
740 :param markup: HTML or XML markup in an unknown encoding.
742 :param known_definite_encodings: When determining the encoding
743 of ``markup``, these encodings will be tried first, in
744 order. In HTML terms, this corresponds to the "known
745 definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
747 :param user_encodings: These encodings will be tried after the
748 ``known_definite_encodings`` have been tried and failed, and
749 after an attempt to sniff the encoding by looking at a
750 byte order mark has failed. In HTML terms, this
751 corresponds to the step "user has explicitly instructed
752 the user agent to override the document's character
753 encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
755 :param override_encodings: A **deprecated** alias for
756 ``known_definite_encodings``. Any encodings here will be tried
757 immediately after the encodings in
758 ``known_definite_encodings``.
760 :param smart_quotes_to: By default, Microsoft smart quotes will,
761 like all other characters, be converted to Unicode
762 characters. Setting this to ``ascii`` will convert them to ASCII
763 quotes instead. Setting it to ``xml`` will convert them to XML
764 entity references, and setting it to ``html`` will convert them
765 to HTML entity references.
767 :param is_html: If True, ``markup`` is treated as an HTML
768 document. Otherwise it's treated as an XML document.
770 :param exclude_encodings: These encodings will not be considered,
771 even if the sniffing code thinks they might make sense.
773 """
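# A minimal usage sketch (hypothetical doctest, not taken verbatim from the
# library's documentation), assuming "latin-1" is passed as a known definite
# encoding:
#
#     >>> dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1"])
#     >>> dammit.unicode_markup
#     'Sacré bleu!'
#     >>> dammit.original_encoding
#     'latin-1'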
775 def __init__(
776 self,
777 markup: bytes,
778 known_definite_encodings: Optional[_Encodings] = [],
779 smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
780 is_html: bool = False,
781 exclude_encodings: Optional[_Encodings] = [],
782 user_encodings: Optional[_Encodings] = None,
783 override_encodings: Optional[_Encodings] = None,
784 ):
785 self.smart_quotes_to = smart_quotes_to
786 self.tried_encodings = []
787 self.contains_replacement_characters = False
788 self.is_html = is_html
789 self.log = getLogger(__name__)
790 self.detector = EncodingDetector(
791 markup,
792 known_definite_encodings,
793 is_html,
794 exclude_encodings,
795 user_encodings,
796 override_encodings,
797 )
799 # Short-circuit if the data is in Unicode to begin with.
800 if isinstance(markup, str):
801 self.markup = markup.encode("utf8")
802 self.unicode_markup = markup
803 self.original_encoding = None
804 return
806 # The encoding detector may have stripped a byte-order mark.
807 # Use the stripped markup from this point on.
808 self.markup = self.detector.markup
810 u = None
811 for encoding in self.detector.encodings:
812 markup = self.detector.markup
813 u = self._convert_from(encoding)
814 if u is not None:
815 break
817 if not u:
818 # None of the encodings worked. As an absolute last resort,
819 # try them again with character replacement.
821 for encoding in self.detector.encodings:
822 if encoding != "ascii":
823 u = self._convert_from(encoding, "replace")
824 if u is not None:
825 self.log.warning(
826 "Some characters could not be decoded, and were "
827 "replaced with REPLACEMENT CHARACTER."
828 )
830 self.contains_replacement_characters = True
831 break
833 # If none of that worked, we could at this point force it to
834 # ASCII, but that would destroy so much data that I think
835 # giving up is better.
836 #
837 # Note that this is extremely unlikely, probably impossible,
838 # because the "replace" strategy is so powerful. Even running
839 # the Python binary through Unicode, Dammit gives you Unicode,
840 # albeit Unicode riddled with REPLACEMENT CHARACTER.
841 if u is None:
842 self.original_encoding = None
843 self.unicode_markup = None
844 else:
845 self.unicode_markup = u
847 #: The original markup, before it was converted to Unicode.
848 #: This is not necessarily the same as what was passed in to the
849 #: constructor, since any byte-order mark will be stripped.
850 markup: bytes
852 #: The Unicode version of the markup, following conversion. This
853 #: is set to None if there was simply no way to convert the
854 #: bytestring to Unicode (as with binary data).
855 unicode_markup: Optional[str]
857 #: This is True if `UnicodeDammit.unicode_markup` contains
858 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
859 #: in `UnicodeDammit.markup`. These mark character sequences that
860 #: could not be represented in Unicode.
861 contains_replacement_characters: bool
863 #: Unicode, Dammit's best guess as to the original character
864 #: encoding of `UnicodeDammit.markup`.
865 original_encoding: Optional[_Encoding]
867 #: The strategy used to handle Microsoft smart quotes.
868 smart_quotes_to: Optional[str]
870 #: The (encoding, error handling strategy) 2-tuples that were used to
871 #: try and convert the markup to Unicode.
872 tried_encodings: List[Tuple[_Encoding, str]]
874 log: Logger #: :meta private:
876 def _sub_ms_char(self, match: re.Match) -> bytes:
877 """Changes a MS smart quote character to an XML or HTML
878 entity, or an ASCII character.
880 TODO: Since this is only used to convert smart quotes, it
881 could be simplified, and MS_CHARS_TO_ASCII made much less
882 parochial.
883 """
884 orig: bytes = match.group(1)
885 sub: bytes
886 if self.smart_quotes_to == "ascii":
887 if orig in self.MS_CHARS_TO_ASCII:
888 sub = self.MS_CHARS_TO_ASCII[orig].encode()
889 else:
890 # Shouldn't happen; substitute the character
891 # with itself.
892 sub = orig
893 else:
894 if orig in self.MS_CHARS:
895 substitutions = self.MS_CHARS[orig]
896 if type(substitutions) is tuple:
897 if self.smart_quotes_to == "xml":
898 sub = b"&#x" + substitutions[1].encode() + b";"
899 else:
900 sub = b"&" + substitutions[0].encode() + b";"
901 else:
902 substitutions = cast(str, substitutions)
903 sub = substitutions.encode()
904 else:
905 # Shouldn't happen; substitute the character
906 # for itself.
907 sub = orig
908 return sub
910 #: This dictionary maps commonly seen values for "charset" in HTML
911 #: meta tags to the corresponding Python codec names. It only covers
912 #: values that aren't in Python's aliases and can't be determined
913 #: by the heuristics in `find_codec`.
914 #:
915 #: :meta hide-value:
916 CHARSET_ALIASES: Dict[str, _Encoding] = {
917 "macintosh": "mac-roman",
918 "x-sjis": "shift-jis",
919 }
921 #: A list of encodings that tend to contain Microsoft smart quotes.
922 #:
923 #: :meta hide-value:
924 ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
925 "windows-1252",
926 "iso-8859-1",
927 "iso-8859-2",
928 ]
930 def _convert_from(
931 self, proposed: _Encoding, errors: str = "strict"
932 ) -> Optional[str]:
933 """Attempt to convert the markup to the proposed encoding.
935 :param proposed: The name of a character encoding.
936 :param errors: An error handling strategy, used when calling `str`.
937 :return: The converted markup, or `None` if the proposed
938 encoding/error handling strategy didn't work.
939 """
940 lookup_result = self.find_codec(proposed)
941 if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
942 return None
943 proposed = lookup_result
944 self.tried_encodings.append((proposed, errors))
945 markup = self.markup
946 # Convert smart quotes to HTML if coming from an encoding
947 # that might have them.
948 if (
949 self.smart_quotes_to is not None
950 and proposed in self.ENCODINGS_WITH_SMART_QUOTES
951 ):
952 smart_quotes_re = b"([\x80-\x9f])"
953 smart_quotes_compiled = re.compile(smart_quotes_re)
954 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
956 try:
957 # print("Trying to convert document to %s (errors=%s)" % (
958 # proposed, errors))
959 u = self._to_unicode(markup, proposed, errors)
960 self.unicode_markup = u
961 self.original_encoding = proposed
962 except Exception:
963 # print("That didn't work!")
964 # print(e)
965 return None
966 # print("Correct encoding: %s" % proposed)
967 return self.unicode_markup
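# An illustrative (hypothetical) example of the smart-quote handling performed
# in _convert_from, assuming the markup really is Windows-1252:
#
#     >>> markup = b"I just \x93love\x94 Microsoft Word\x92s smart quotes"
#     >>> UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
#     'I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes'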
969 def _to_unicode(
970 self, data: bytes, encoding: _Encoding, errors: str = "strict"
971 ) -> str:
972 """Given a bytestring and its encoding, decodes the string into Unicode.
974 :param encoding: The name of an encoding.
975 :param errors: An error handling strategy, used when calling `str`.
976 """
977 return str(data, encoding, errors)
979 @property
980 def declared_html_encoding(self) -> Optional[_Encoding]:
981 """If the markup is an HTML document, returns the encoding, if any,
982 declared *inside* the document.
983 """
984 if not self.is_html:
985 return None
986 return self.detector.declared_encoding
988 def find_codec(self, charset: _Encoding) -> Optional[str]:
989 """Look up the Python codec corresponding to a given character set.
991 :param charset: The name of a character set.
992 :return: The name of a Python codec.
993 """
994 value = (
995 self._codec(self.CHARSET_ALIASES.get(charset, charset))
996 or (charset and self._codec(charset.replace("-", "")))
997 or (charset and self._codec(charset.replace("-", "_")))
998 or (charset and charset.lower())
999 or charset
1000 )
1001 if value:
1002 return value.lower()
1003 return None
1005 def _codec(self, charset: _Encoding) -> Optional[str]:
1006 if not charset:
1007 return charset
1008 codec = None
1009 try:
1010 codecs.lookup(charset)
1011 codec = charset
1012 except (LookupError, ValueError):
1013 pass
1014 return codec
1016 #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
1017 #:
1018 #: :meta hide-value:
1019 MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
1020 b"\x80": ("euro", "20AC"),
1021 b"\x81": " ",
1022 b"\x82": ("sbquo", "201A"),
1023 b"\x83": ("fnof", "192"),
1024 b"\x84": ("bdquo", "201E"),
1025 b"\x85": ("hellip", "2026"),
1026 b"\x86": ("dagger", "2020"),
1027 b"\x87": ("Dagger", "2021"),
1028 b"\x88": ("circ", "2C6"),
1029 b"\x89": ("permil", "2030"),
1030 b"\x8a": ("Scaron", "160"),
1031 b"\x8b": ("lsaquo", "2039"),
1032 b"\x8c": ("OElig", "152"),
1033 b"\x8d": "?",
1034 b"\x8e": ("#x17D", "17D"),
1035 b"\x8f": "?",
1036 b"\x90": "?",
1037 b"\x91": ("lsquo", "2018"),
1038 b"\x92": ("rsquo", "2019"),
1039 b"\x93": ("ldquo", "201C"),
1040 b"\x94": ("rdquo", "201D"),
1041 b"\x95": ("bull", "2022"),
1042 b"\x96": ("ndash", "2013"),
1043 b"\x97": ("mdash", "2014"),
1044 b"\x98": ("tilde", "2DC"),
1045 b"\x99": ("trade", "2122"),
1046 b"\x9a": ("scaron", "161"),
1047 b"\x9b": ("rsaquo", "203A"),
1048 b"\x9c": ("oelig", "153"),
1049 b"\x9d": "?",
1050 b"\x9e": ("#x17E", "17E"),
1051 b"\x9f": ("Yuml", ""),
1052 }
1054 #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
1055 #: horrors like stripping diacritical marks to turn á into a, but also
1056 #: contains non-horrors like turning “ into ".
1057 #:
1058 #: Seriously, don't use this for anything other than removing smart
1059 #: quotes.
1060 #:
1061 #: :meta private:
1062 MS_CHARS_TO_ASCII: Dict[bytes, str] = {
1063 b"\x80": "EUR",
1064 b"\x81": " ",
1065 b"\x82": ",",
1066 b"\x83": "f",
1067 b"\x84": ",,",
1068 b"\x85": "...",
1069 b"\x86": "+",
1070 b"\x87": "++",
1071 b"\x88": "^",
1072 b"\x89": "%",
1073 b"\x8a": "S",
1074 b"\x8b": "<",
1075 b"\x8c": "OE",
1076 b"\x8d": "?",
1077 b"\x8e": "Z",
1078 b"\x8f": "?",
1079 b"\x90": "?",
1080 b"\x91": "'",
1081 b"\x92": "'",
1082 b"\x93": '"',
1083 b"\x94": '"',
1084 b"\x95": "*",
1085 b"\x96": "-",
1086 b"\x97": "--",
1087 b"\x98": "~",
1088 b"\x99": "(TM)",
1089 b"\x9a": "s",
1090 b"\x9b": ">",
1091 b"\x9c": "oe",
1092 b"\x9d": "?",
1093 b"\x9e": "z",
1094 b"\x9f": "Y",
1095 b"\xa0": " ",
1096 b"\xa1": "!",
1097 b"\xa2": "c",
1098 b"\xa3": "GBP",
1099 b"\xa4": "$", # This approximation is especially parochial--this is the
1100 # generic currency symbol.
1101 b"\xa5": "YEN",
1102 b"\xa6": "|",
1103 b"\xa7": "S",
1104 b"\xa8": "..",
1105 b"\xa9": "",
1106 b"\xaa": "(th)",
1107 b"\xab": "<<",
1108 b"\xac": "!",
1109 b"\xad": " ",
1110 b"\xae": "(R)",
1111 b"\xaf": "-",
1112 b"\xb0": "o",
1113 b"\xb1": "+-",
1114 b"\xb2": "2",
1115 b"\xb3": "3",
1116 b"\xb4": "'",
1117 b"\xb5": "u",
1118 b"\xb6": "P",
1119 b"\xb7": "*",
1120 b"\xb8": ",",
1121 b"\xb9": "1",
1122 b"\xba": "(th)",
1123 b"\xbb": ">>",
1124 b"\xbc": "1/4",
1125 b"\xbd": "1/2",
1126 b"\xbe": "3/4",
1127 b"\xbf": "?",
1128 b"\xc0": "A",
1129 b"\xc1": "A",
1130 b"\xc2": "A",
1131 b"\xc3": "A",
1132 b"\xc4": "A",
1133 b"\xc5": "A",
1134 b"\xc6": "AE",
1135 b"\xc7": "C",
1136 b"\xc8": "E",
1137 b"\xc9": "E",
1138 b"\xca": "E",
1139 b"\xcb": "E",
1140 b"\xcc": "I",
1141 b"\xcd": "I",
1142 b"\xce": "I",
1143 b"\xcf": "I",
1144 b"\xd0": "D",
1145 b"\xd1": "N",
1146 b"\xd2": "O",
1147 b"\xd3": "O",
1148 b"\xd4": "O",
1149 b"\xd5": "O",
1150 b"\xd6": "O",
1151 b"\xd7": "*",
1152 b"\xd8": "O",
1153 b"\xd9": "U",
1154 b"\xda": "U",
1155 b"\xdb": "U",
1156 b"\xdc": "U",
1157 b"\xdd": "Y",
1158 b"\xde": "b",
1159 b"\xdf": "B",
1160 b"\xe0": "a",
1161 b"\xe1": "a",
1162 b"\xe2": "a",
1163 b"\xe3": "a",
1164 b"\xe4": "a",
1165 b"\xe5": "a",
1166 b"\xe6": "ae",
1167 b"\xe7": "c",
1168 b"\xe8": "e",
1169 b"\xe9": "e",
1170 b"\xea": "e",
1171 b"\xeb": "e",
1172 b"\xec": "i",
1173 b"\xed": "i",
1174 b"\xee": "i",
1175 b"\xef": "i",
1176 b"\xf0": "o",
1177 b"\xf1": "n",
1178 b"\xf2": "o",
1179 b"\xf3": "o",
1180 b"\xf4": "o",
1181 b"\xf5": "o",
1182 b"\xf6": "o",
1183 b"\xf7": "/",
1184 b"\xf8": "o",
1185 b"\xf9": "u",
1186 b"\xfa": "u",
1187 b"\xfb": "u",
1188 b"\xfc": "u",
1189 b"\xfd": "y",
1190 b"\xfe": "b",
1191 b"\xff": "y",
1192 }
1194 #: A map used when removing rogue Windows-1252/ISO-8859-1
1195 #: characters in otherwise UTF-8 documents. Also used when a
1196 #: numeric character entity has been incorrectly encoded using the
1197 #: character's Windows-1252 encoding.
1198 #:
1199 #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
1200 #: Windows-1252.
1201 #:
1202 #: :meta hide-value:
1203 WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
1204 0x80: b"\xe2\x82\xac", # €
1205 0x82: b"\xe2\x80\x9a", # ‚
1206 0x83: b"\xc6\x92", # ƒ
1207 0x84: b"\xe2\x80\x9e", # „
1208 0x85: b"\xe2\x80\xa6", # …
1209 0x86: b"\xe2\x80\xa0", # †
1210 0x87: b"\xe2\x80\xa1", # ‡
1211 0x88: b"\xcb\x86", # ˆ
1212 0x89: b"\xe2\x80\xb0", # ‰
1213 0x8A: b"\xc5\xa0", # Š
1214 0x8B: b"\xe2\x80\xb9", # ‹
1215 0x8C: b"\xc5\x92", # Œ
1216 0x8E: b"\xc5\xbd", # Ž
1217 0x91: b"\xe2\x80\x98", # ‘
1218 0x92: b"\xe2\x80\x99", # ’
1219 0x93: b"\xe2\x80\x9c", # “
1220 0x94: b"\xe2\x80\x9d", # ”
1221 0x95: b"\xe2\x80\xa2", # •
1222 0x96: b"\xe2\x80\x93", # –
1223 0x97: b"\xe2\x80\x94", # —
1224 0x98: b"\xcb\x9c", # ˜
1225 0x99: b"\xe2\x84\xa2", # ™
1226 0x9A: b"\xc5\xa1", # š
1227 0x9B: b"\xe2\x80\xba", # ›
1228 0x9C: b"\xc5\x93", # œ
1229 0x9E: b"\xc5\xbe", # ž
1230 0x9F: b"\xc5\xb8", # Ÿ
1231 0xA0: b"\xc2\xa0", #
1232 0xA1: b"\xc2\xa1", # ¡
1233 0xA2: b"\xc2\xa2", # ¢
1234 0xA3: b"\xc2\xa3", # £
1235 0xA4: b"\xc2\xa4", # ¤
1236 0xA5: b"\xc2\xa5", # ¥
1237 0xA6: b"\xc2\xa6", # ¦
1238 0xA7: b"\xc2\xa7", # §
1239 0xA8: b"\xc2\xa8", # ¨
1240 0xA9: b"\xc2\xa9", # ©
1241 0xAA: b"\xc2\xaa", # ª
1242 0xAB: b"\xc2\xab", # «
1243 0xAC: b"\xc2\xac", # ¬
1244 0xAD: b"\xc2\xad", #
1245 0xAE: b"\xc2\xae", # ®
1246 0xAF: b"\xc2\xaf", # ¯
1247 0xB0: b"\xc2\xb0", # °
1248 0xB1: b"\xc2\xb1", # ±
1249 0xB2: b"\xc2\xb2", # ²
1250 0xB3: b"\xc2\xb3", # ³
1251 0xB4: b"\xc2\xb4", # ´
1252 0xB5: b"\xc2\xb5", # µ
1253 0xB6: b"\xc2\xb6", # ¶
1254 0xB7: b"\xc2\xb7", # ·
1255 0xB8: b"\xc2\xb8", # ¸
1256 0xB9: b"\xc2\xb9", # ¹
1257 0xBA: b"\xc2\xba", # º
1258 0xBB: b"\xc2\xbb", # »
1259 0xBC: b"\xc2\xbc", # ¼
1260 0xBD: b"\xc2\xbd", # ½
1261 0xBE: b"\xc2\xbe", # ¾
1262 0xBF: b"\xc2\xbf", # ¿
1263 0xC0: b"\xc3\x80", # À
1264 0xC1: b"\xc3\x81", # Á
1265 0xC2: b"\xc3\x82", # Â
1266 0xC3: b"\xc3\x83", # Ã
1267 0xC4: b"\xc3\x84", # Ä
1268 0xC5: b"\xc3\x85", # Å
1269 0xC6: b"\xc3\x86", # Æ
1270 0xC7: b"\xc3\x87", # Ç
1271 0xC8: b"\xc3\x88", # È
1272 0xC9: b"\xc3\x89", # É
1273 0xCA: b"\xc3\x8a", # Ê
1274 0xCB: b"\xc3\x8b", # Ë
1275 0xCC: b"\xc3\x8c", # Ì
1276 0xCD: b"\xc3\x8d", # Í
1277 0xCE: b"\xc3\x8e", # Î
1278 0xCF: b"\xc3\x8f", # Ï
1279 0xD0: b"\xc3\x90", # Ð
1280 0xD1: b"\xc3\x91", # Ñ
1281 0xD2: b"\xc3\x92", # Ò
1282 0xD3: b"\xc3\x93", # Ó
1283 0xD4: b"\xc3\x94", # Ô
1284 0xD5: b"\xc3\x95", # Õ
1285 0xD6: b"\xc3\x96", # Ö
1286 0xD7: b"\xc3\x97", # ×
1287 0xD8: b"\xc3\x98", # Ø
1288 0xD9: b"\xc3\x99", # Ù
1289 0xDA: b"\xc3\x9a", # Ú
1290 0xDB: b"\xc3\x9b", # Û
1291 0xDC: b"\xc3\x9c", # Ü
1292 0xDD: b"\xc3\x9d", # Ý
1293 0xDE: b"\xc3\x9e", # Þ
1294 0xDF: b"\xc3\x9f", # ß
1295 0xE0: b"\xc3\xa0", # à
1296 0xE1: b"\xc3\xa1", # á
1297 0xE2: b"\xc3\xa2", # â
1298 0xE3: b"\xc3\xa3", # ã
1299 0xE4: b"\xc3\xa4", # ä
1300 0xE5: b"\xc3\xa5", # å
1301 0xE6: b"\xc3\xa6", # æ
1302 0xE7: b"\xc3\xa7", # ç
1303 0xE8: b"\xc3\xa8", # è
1304 0xE9: b"\xc3\xa9", # é
1305 0xEA: b"\xc3\xaa", # ê
1306 0xEB: b"\xc3\xab", # ë
1307 0xEC: b"\xc3\xac", # ì
1308 0xED: b"\xc3\xad", # í
1309 0xEE: b"\xc3\xae", # î
1310 0xEF: b"\xc3\xaf", # ï
1311 0xF0: b"\xc3\xb0", # ð
1312 0xF1: b"\xc3\xb1", # ñ
1313 0xF2: b"\xc3\xb2", # ò
1314 0xF3: b"\xc3\xb3", # ó
1315 0xF4: b"\xc3\xb4", # ô
1316 0xF5: b"\xc3\xb5", # õ
1317 0xF6: b"\xc3\xb6", # ö
1318 0xF7: b"\xc3\xb7", # ÷
1319 0xF8: b"\xc3\xb8", # ø
1320 0xF9: b"\xc3\xb9", # ù
1321 0xFA: b"\xc3\xba", # ú
1322 0xFB: b"\xc3\xbb", # û
1323 0xFC: b"\xc3\xbc", # ü
1324 0xFD: b"\xc3\xbd", # ý
1325 0xFE: b"\xc3\xbe", # þ
1326 0xFF: b"\xc3\xbf", # ÿ
1327 }
1329 #: :meta private:
1330 # Note that this isn't all Unicode noncharacters, just the noncontiguous ones that need to be listed.
1331 #
1332 # "A noncharacter is a code point that is in the range
1333 # U+FDD0 to U+FDEF, inclusive, or U+FFFE, U+FFFF, U+1FFFE,
1334 # U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE,
1335 # U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
1336 # U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE,
1337 # U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
1338 # U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE,
1339 # or U+10FFFF."
1340 ENUMERATED_NONCHARACTERS: Set[int] = set([0xfffe, 0xffff,
1341 0x1fffe, 0x1ffff,
1342 0x2fffe, 0x2ffff,
1343 0x3fffe, 0x3ffff,
1344 0x4fffe, 0x4ffff,
1345 0x5fffe, 0x5ffff,
1346 0x6fffe, 0x6ffff,
1347 0x7fffe, 0x7ffff,
1348 0x8fffe, 0x8ffff,
1349 0x9fffe, 0x9ffff,
1350 0xafffe, 0xaffff,
1351 0xbfffe, 0xbffff,
1352 0xcfffe, 0xcffff,
1353 0xdfffe, 0xdffff,
1354 0xefffe, 0xeffff,
1355 0xffffe, 0xfffff,
1356 0x10fffe, 0x10ffff])
1358 #: :meta private:
1359 MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
1360 (0xC2, 0xDF, 2), # 2-byte characters start with a byte C2-DF
1361 (0xE0, 0xEF, 3), # 3-byte characters start with E0-EF
1362 (0xF0, 0xF4, 4), # 4-byte characters start with F0-F4
1363 ]
1365 #: :meta private:
1366 FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]
1368 #: :meta private:
1369 LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
1371 @classmethod
1372 def numeric_character_reference(cls, numeric:int) -> Tuple[str, bool]:
1373 """This (mostly) implements the algorithm described in "Numeric character
1374 reference end state" from the HTML spec:
1375 https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
1377 The algorithm is designed to convert numeric character references like "☃"
1378 to Unicode characters like "☃".
1380 :return: A 2-tuple (character, replaced). `character` is the Unicode
1381 character corresponding to the numeric reference and `replaced` is
1382 whether or not an unresolvable character was replaced with REPLACEMENT
1383 CHARACTER.
1384 """
1385 replacement = "\ufffd"
1387 if numeric == 0x00:
1388 # "If the number is 0x00, then this is a
1389 # null-character-reference parse error. Set the character
1390 # reference code to 0xFFFD."
1391 return replacement, True
1393 if numeric > 0x10ffff:
1394 # "If the number is greater than 0x10FFFF, then this is a
1395 # character-reference-outside-unicode-range parse
1396 # error. Set the character reference code to 0xFFFD."
1397 return replacement, True
1399 if numeric >= 0xd800 and numeric <= 0xdfff:
1400 # "If the number is a surrogate, then this is a
1401 # surrogate-character-reference parse error. Set the
1402 # character reference code to 0xFFFD."
1403 return replacement, True
1405 if (numeric >= 0xfdd0 and numeric <= 0xfdef) or numeric in cls.ENUMERATED_NONCHARACTERS:
1406 # "If the number is a noncharacter, then this is a
1407 # noncharacter-character-reference parse error."
1408 #
1409 # "The parser resolves such character references as-is."
1410 #
1411 # I'm not sure what "as-is" means but I think it means that we act
1412 # like there was no error condition.
1413 return chr(numeric), False
1415 # "If the number is 0x0D, or a control that's not ASCII whitespace,
1416 # then this is a control-character-reference parse error."
1417 #
1418 # "A control is a C0 control or a code point in the range
1419 # U+007F DELETE to U+009F APPLICATION PROGRAM COMMAND,
1420 # inclusive."
1421 #
1422 # "A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive."
1423 #
1424 # "The parser resolves such character references as-is except C1 control references that are replaced."
1426 # First, let's replace the control references that can be replaced.
1427 if numeric >= 0x80 and numeric <= 0x9f and numeric in cls.WINDOWS_1252_TO_UTF8:
1428 # "If the number is one of the numbers in the first column of the
1429 # following table, then find the row with that number in the first
1430 # column, and set the character reference code to the number in the
1431 # second column of that row."
1432 #
1433 # This is an attempt to catch characters that were encoded to numeric
1434 # entities using their Windows-1252 encodings rather than their UTF-8
1435 # encodings.
1436 return cls.WINDOWS_1252_TO_UTF8[numeric].decode("utf8"), False
1438 # Now all that's left are references that should be resolved as-is. This
1439 # is also the default path for non-weird character references.
1440 try:
1441 return chr(numeric), False
1442 except (ValueError, OverflowError):
1443 # This shouldn't happen, since these cases should have been handled
1444 # above, but if it does, return REPLACEMENT CHARACTER
1445 return replacement, True
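# Hypothetical examples of the cases handled above (an editor's sketch):
#
#     >>> UnicodeDammit.numeric_character_reference(0x2603)   # &#x2603; -> snowman
#     ('☃', False)
#     >>> UnicodeDammit.numeric_character_reference(0x93)     # Windows-1252 left quote
#     ('“', False)
#     >>> UnicodeDammit.numeric_character_reference(0x110000) # outside the Unicode range
#     ('\ufffd', True)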
1447 @classmethod
1448 def detwingle(
1449 cls,
1450 in_bytes: bytes,
1451 main_encoding: _Encoding = "utf8",
1452 embedded_encoding: _Encoding = "windows-1252",
1453 ) -> bytes:
1454 """Fix characters from one encoding embedded in some other encoding.
1456 Currently the only situation supported is Windows-1252 (or its
1457 subset ISO-8859-1), embedded in UTF-8.
1459 :param in_bytes: A bytestring that you suspect contains
1460 characters from multiple encodings. Note that this *must*
1461 be a bytestring. If you've already converted the document
1462 to Unicode, you're too late.
1463 :param main_encoding: The primary encoding of ``in_bytes``.
1464 :param embedded_encoding: The encoding that was used to embed characters
1465 in the main document.
1466 :return: A bytestring similar to ``in_bytes``, in which
1467 ``embedded_encoding`` characters have been converted to
1468 their ``main_encoding`` equivalents.
1469 """
1470 if embedded_encoding.replace("_", "-").lower() not in (
1471 "windows-1252",
1472 "windows_1252",
1473 ):
1474 raise NotImplementedError(
1475 "Windows-1252 and ISO-8859-1 are the only currently supported "
1476 "embedded encodings."
1477 )
1479 if main_encoding.lower() not in ("utf8", "utf-8"):
1480 raise NotImplementedError(
1481 "UTF-8 is the only currently supported main encoding."
1482 )
1484 byte_chunks = []
1486 chunk_start = 0
1487 pos = 0
1488 while pos < len(in_bytes):
1489 byte = in_bytes[pos]
1490 if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
1491 # This is the start of a UTF-8 multibyte character. Skip
1492 # to the end.
1493 for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
1494 if byte >= start and byte <= end:
1495 pos += size
1496 break
1497 elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
1498 # We found a Windows-1252 character!
1499 # Save the string up to this point as a chunk.
1500 byte_chunks.append(in_bytes[chunk_start:pos])
1502 # Now translate the Windows-1252 character into UTF-8
1503 # and add it as another, one-byte chunk.
1504 byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
1505 pos += 1
1506 chunk_start = pos
1507 else:
1508 # Go on to the next character.
1509 pos += 1
1510 if chunk_start == 0:
1511 # The string is unchanged.
1512 return in_bytes
1513 else:
1514 # Store the final chunk.
1515 byte_chunks.append(in_bytes[chunk_start:])
1516 return b"".join(byte_chunks)
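# A minimal sketch of detwingle in action (hypothetical doctest): UTF-8 snowmen
# with Windows-1252 smart quotes embedded in the same bytestring:
#
#     >>> snowmen = "\N{SNOWMAN}" * 3
#     >>> quote = "\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}"
#     >>> doc = snowmen.encode("utf8") + quote.encode("windows_1252")
#     >>> UnicodeDammit.detwingle(doc).decode("utf8")
#     '☃☃☃“Hi!”'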