# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's `Universal
Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
by Kurt McKee. It does not rewrite the body of an XML or HTML document
to reflect a new encoding; that's the job of `TreeBuilder`.
"""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

from html.entities import codepoint2name
from collections import defaultdict
import codecs
from html.entities import html5
import re
from logging import Logger, getLogger
from types import ModuleType
from typing import (
    Dict,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)
from typing_extensions import Literal
from bs4._typing import (
    _Encoding,
    _Encodings,
)
import warnings
# Import a library to autodetect character encodings. We'll support
# any of a number of libraries that all support the same API:
#
# * cchardet
# * chardet
# * charset-normalizer
chardet_module: Optional[ModuleType] = None
try:
    # PyPI package: cchardet
    import cchardet

    chardet_module = cchardet
except ImportError:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet

        chardet_module = chardet
    except ImportError:
        try:
            # PyPI package: charset-normalizer
            import charset_normalizer

            chardet_module = charset_normalizer
        except ImportError:
            # No chardet available.
            pass


def _chardet_dammit(s: bytes) -> Optional[str]:
    """Try as hard as possible to detect the encoding of a bytestring."""
    if chardet_module is None or isinstance(s, str):
        return None
    module = chardet_module
    return module.detect(s)["encoding"]


# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
encoding_res: Dict[Type, Dict[str, Pattern]] = dict()
encoding_res[bytes] = {
    "html": re.compile(html_meta.encode("ascii"), re.I),
    "xml": re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    "html": re.compile(html_meta, re.I),
    "xml": re.compile(xml_encoding, re.I),
}
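
# Illustrative sketch of how these patterns are meant to be used: searching a
# bytestring with the compiled "html" pattern is expected to capture the
# declared charset name (expected output shown, for illustration only):
#
#   >>> m = encoding_res[bytes]["html"].search(b'<meta charset="utf-8">')
#   >>> m.group(1)
#   b'utf-8'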


class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: A map of named HTML entities to the corresponding Unicode string.
    #:
    #: :meta hide-value:
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: A map of Unicode strings to the corresponding named HTML entities;
    #: the inverse of HTML_ENTITY_TO_CHARACTER.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: A regular expression that matches any character (or, in rare
    #: cases, pair of characters) that can be replaced with a named
    #: HTML entity.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: A very similar regular expression to
    #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
    #: ampersands. This is used by the 'html' formatter to provide
    #: backwards-compatibility, even though the HTML5 spec allows most
    #: ampersands to go unescaped.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @classmethod
    def _populate_class_variables(cls) -> None:
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function sets the following class variables:

        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
         entity names like "angmsdaa". When a single Unicode string has
         multiple entity names, we try to choose the most commonly-used
         name.

        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
         Unicode strings like "⦨".

        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
         Unicode string that corresponds to an HTML5 named entity.

        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
         regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
         also matches unescaped ampersands. This is used by the 'html'
         formatter to provide backwards-compatibility, even though the HTML5
         spec allows most ampersands to go unescaped.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like "|" into named entities like
                # "&verbar;". The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like
                # '&fjlig;', though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            unicode_to_name[character] = name

        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )
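
    # Illustrative sketch of why the negative lookahead above matters: once
    # the class variables are populated, CHARACTER_TO_HTML_ENTITY_RE is
    # expected to match "\u2267\u0338" as a single two-character sequence,
    # but plain "\u2267" as a one-character sequence (expected behavior,
    # shown for illustration only):
    #
    #   >>> EntitySubstitution.CHARACTER_TO_HTML_ENTITY_RE.match("\u2267\u0338foo").group(0)
    #   '\u2267\u0338'
    #   >>> EntitySubstitution.CHARACTER_TO_HTML_ENTITY_RE.match("\u2267foo").group(0)
    #   '\u2267'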

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches any named or numeric HTML entity.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string."""
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            return "&amp;%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            return "&%s;" % possible_entity
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> Welcome to &quot;Bob's bar&quot;

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with
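
    # Illustrative examples of the quoting rules described above (expected
    # results, shown as a sketch rather than executable doctests):
    #
    #   >>> EntitySubstitution.quoted_attribute_value("Bob's Bar")
    #   '"Bob\'s Bar"'
    #   >>> EntitySubstitution.quoted_attribute_value('Welcome to "my bar"')
    #   '\'Welcome to "my bar"\''
    #   >>> EntitySubstitution.quoted_attribute_value('Welcome to "Bob\'s Bar"')
    #   '"Welcome to &quot;Bob\'s Bar&quot;"'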

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
         with named entities.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value
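
    # A sketch contrasting the two XML substitution methods above (expected
    # results, for illustration): the second one leaves ampersands that
    # already look like entities alone.
    #
    #   >>> EntitySubstitution.substitute_xml("AT&T is <b>big</b>")
    #   'AT&amp;T is &lt;b&gt;big&lt;/b&gt;'
    #   >>> EntitySubstitution.substitute_xml_containing_entities("&eacute; & <b>")
    #   '&eacute; &amp; &lt;b&gt;'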

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
          HTML entities.
        """
        # Convert any appropriate characters to HTML entities.
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
          HTML entities.
        """
        # First, escape any HTML entities found in the markup.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Next, convert any appropriate characters to unescaped HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
          HTML entities.
        """
        # First, escape the ampersand for anything that looks like an
        # entity but isn't in the list of recognized entities. All other
        # ampersands can be left alone.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then, convert a range of Unicode characters to unescaped
        # HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s
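
    # A sketch of the difference between substitute_html5 and
    # substitute_html5_raw (expected results, for illustration): the former
    # re-escapes every entity-like sequence, while the latter leaves
    # recognized entities alone and only escapes ambiguous ampersands.
    #
    #   >>> EntitySubstitution.substitute_html5("&copy; 2024 AT&T")
    #   '&amp;copy; 2024 AT&T'
    #   >>> EntitySubstitution.substitute_html5_raw("&copy; 2024 AT&T &fakeentity;")
    #   '&copy; 2024 AT&T &amp;fakeentity;'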


EntitySubstitution._populate_class_variables()


class EncodingDetector:
    """This class is capable of guessing a number of possible encodings
    for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
       (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
       byte-order mark sniffing fails (the ``user_encodings`` argument to the
       constructor).

    4. An encoding declared within the bytestring itself, either in an
       XML declaration (if the bytestring is to be interpreted as an XML
       document), or in a <meta> tag (if the bytestring is to be
       interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
       cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.
    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = False if is_html is None else is_html
        self.declared_encoding: Optional[str] = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This
            will be modified as a side effect.
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings:
            return False
        if encoding not in tried:
            tried.add(encoding)
            return True
        return False

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings. Each is the name of an encoding
           that *might* work to convert a bytestring into Unicode.
        """
        tried: Set[_Encoding] = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self.sniffed_encoding is not None and self._usable(
            self.sniffed_encoding, tried
        ):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        if self.declared_encoding is not None and self._usable(
            self.declared_encoding, tried
        ):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        if self.chardet_encoding is not None and self._usable(
            self.chardet_encoding, tried
        ):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ("utf-8", "windows-1252"):
            if self._usable(e, tried):
                yield e
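
    # Illustrative sketch of the precedence order implemented above (expected
    # behavior, assuming no chardet-style library is installed):
    #
    #   >>> detector = EncodingDetector(
    #   ...     b'<?xml version="1.0" encoding="ISO-8859-1"?><doc/>',
    #   ...     known_definite_encodings=["utf-8"],
    #   ... )
    #   >>> list(detector.encodings)
    #   ['utf-8', 'iso-8859-1', 'windows-1252']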

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
            byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        if (
            (len(data) >= 4)
            and (data[:2] == b"\xfe\xff")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16be"
            data = data[2:]
        elif (
            (len(data) >= 4)
            and (data[:2] == b"\xff\xfe")
            and (data[2:4] != b"\x00\x00")
        ):
            encoding = "utf-16le"
            data = data[2:]
        elif data[:3] == b"\xef\xbb\xbf":
            encoding = "utf-8"
            data = data[3:]
        elif data[:4] == b"\x00\x00\xfe\xff":
            encoding = "utf-32be"
            data = data[4:]
        elif data[:4] == b"\xff\xfe\x00\x00":
            encoding = "utf-32le"
            data = data[4:]
        return data, encoding
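
    # For illustration, a UTF-8 byte-order mark is expected to be stripped
    # and reported like this:
    #
    #   >>> EncodingDetector.strip_byte_order_mark(b"\xef\xbb\xbf<html></html>")
    #   (b'<html></html>', 'utf-8')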

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Given a document, tries to find an encoding declared within the
        text of the document itself.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to be declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data. Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, if one is found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res["xml"]
        html_re = res["html"]
        declared_encoding: Optional[_Encoding] = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode("ascii", "replace")
            return declared_encoding.lower()
        return None
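
    # For illustration, both declaration styles are expected to be recognized:
    #
    #   >>> EncodingDetector.find_declared_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
    #   'iso-8859-1'
    #   >>> EncodingDetector.find_declared_encoding(b'<meta charset="euc-jp">', is_html=True)
    #   'euc-jp'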


class UnicodeDammit:
    """A class for detecting the encoding of a bytestring containing an
    HTML or XML document, and decoding it to Unicode. If the source
    encoding is windows-1252, `UnicodeDammit` can also replace
    Microsoft smart quotes with their HTML or XML equivalents.

    :param markup: HTML or XML markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param smart_quotes_to: By default, Microsoft smart quotes will,
        like all other characters, be converted to Unicode
        characters. Setting this to ``ascii`` will convert them to ASCII
        quotes instead. Setting it to ``xml`` will convert them to XML
        entity references, and setting it to ``html`` will convert them
        to HTML entity references.

    :param is_html: If True, ``markup`` is treated as an HTML
        document. Otherwise it's treated as an XML document.

    :param exclude_encodings: These encodings will not be considered,
        even if the sniffing code thinks they might make sense.
    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = [],
        smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
        is_html: bool = False,
        exclude_encodings: Optional[_Encodings] = [],
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = getLogger(__name__)
        self.detector = EncodingDetector(
            markup,
            known_definite_encodings,
            is_html,
            exclude_encodings,
            user_encodings,
            override_encodings,
        )

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == b"":
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            markup = self.detector.markup
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.
            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."
                    )

                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        #
        # Note that this is extremely unlikely, probably impossible,
        # because the "replace" strategy is so powerful. Even running
        # the Python binary through Unicode, Dammit gives you Unicode,
        # albeit Unicode riddled with REPLACEMENT CHARACTER.
        if u is None:
            self.original_encoding = None
            self.unicode_markup = None
        else:
            self.unicode_markup = u

    #: The original markup, before it was converted to Unicode.
    #: This is not necessarily the same as what was passed in to the
    #: constructor, since any byte-order mark will be stripped.
    markup: bytes

    #: The Unicode version of the markup, following conversion. This
    #: is set to None if there was simply no way to convert the
    #: bytestring to Unicode (as with binary data).
    unicode_markup: Optional[str]

    #: This is True if `UnicodeDammit.unicode_markup` contains
    #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
    #: in `UnicodeDammit.markup`. These mark character sequences that
    #: could not be represented in Unicode.
    contains_replacement_characters: bool

    #: Unicode, Dammit's best guess as to the original character
    #: encoding of `UnicodeDammit.markup`.
    original_encoding: Optional[_Encoding]

    #: The strategy used to handle Microsoft smart quotes.
    smart_quotes_to: Optional[str]

    #: The (encoding, error handling strategy) 2-tuples that were used to
    #: try and convert the markup to Unicode.
    tried_encodings: List[Tuple[_Encoding, str]]

    log: Logger  #: :meta private:
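
    # Basic usage, for illustration (expected results):
    #
    #   >>> dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1"])
    #   >>> dammit.unicode_markup
    #   'Sacré bleu!'
    #   >>> dammit.original_encoding
    #   'latin-1'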

    def _sub_ms_char(self, match: re.Match) -> bytes:
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        TODO: Since this is only used to convert smart quotes, it
        could be simplified, and MS_CHARS_TO_ASCII made much less
        parochial.
        """
        orig: bytes = match.group(1)
        sub: bytes
        if self.smart_quotes_to == "ascii":
            if orig in self.MS_CHARS_TO_ASCII:
                sub = self.MS_CHARS_TO_ASCII[orig].encode()
            else:
                # Shouldn't happen; substitute the character
                # with itself.
                sub = orig
        else:
            if orig in self.MS_CHARS:
                substitutions = self.MS_CHARS[orig]
                if type(substitutions) is tuple:
                    if self.smart_quotes_to == "xml":
                        sub = b"&#x" + substitutions[1].encode() + b";"
                    else:
                        sub = b"&" + substitutions[0].encode() + b";"
                else:
                    substitutions = cast(str, substitutions)
                    sub = substitutions.encode()
            else:
                # Shouldn't happen; substitute the character
                # for itself.
                sub = orig
        return sub

    #: This dictionary maps commonly seen values for "charset" in HTML
    #: meta tags to the corresponding Python codec names. It only covers
    #: values that aren't in Python's aliases and can't be determined
    #: by the heuristics in `find_codec`.
    #:
    #: :meta hide-value:
    CHARSET_ALIASES: Dict[str, _Encoding] = {
        "macintosh": "mac-roman",
        "x-sjis": "shift-jis",
    }

    #: A list of encodings that tend to contain Microsoft smart quotes.
    #:
    #: :meta hide-value:
    ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def _convert_from(
        self, proposed: _Encoding, errors: str = "strict"
    ) -> Optional[str]:
        """Attempt to convert the markup to the proposed encoding.

        :param proposed: The name of a character encoding.
        :param errors: An error handling strategy, used when calling `str`.
        :return: The converted markup, or `None` if the proposed
           encoding/error handling strategy didn't work.
        """
        lookup_result = self.find_codec(proposed)
        if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
            return None
        proposed = lookup_result
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (
            self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES
        ):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            # print("Trying to convert document to %s (errors=%s)" % (
            #     proposed, errors))
            u = self._to_unicode(markup, proposed, errors)
            self.unicode_markup = u
            self.original_encoding = proposed
        except Exception:
            # print("That didn't work!")
            # print(e)
            return None
        # print("Correct encoding: %s" % proposed)
        return self.unicode_markup
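
    # The smart-quote handling above, for illustration (expected result when
    # smart_quotes_to="html" and the winning encoding is windows-1252):
    #
    #   >>> UnicodeDammit(b"\x91Hello\x92", ["windows-1252"],
    #   ...               smart_quotes_to="html").unicode_markup
    #   '&lsquo;Hello&rsquo;'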

    def _to_unicode(
        self, data: bytes, encoding: _Encoding, errors: str = "strict"
    ) -> str:
        """Given a bytestring and its encoding, decodes the string into Unicode.

        :param encoding: The name of an encoding.
        :param errors: An error handling strategy, used when calling `str`.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self) -> Optional[_Encoding]:
        """If the markup is an HTML document, returns the encoding, if any,
        declared *inside* the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset: _Encoding) -> Optional[str]:
        """Look up the Python codec corresponding to a given character set.

        :param charset: The name of a character set.
        :return: The name of a Python codec.
        """
        value = (
            self._codec(self.CHARSET_ALIASES.get(charset, charset))
            or (charset and self._codec(charset.replace("-", "")))
            or (charset and self._codec(charset.replace("-", "_")))
            or (charset and charset.lower())
            or charset
        )
        if value:
            return value.lower()
        return None

    def _codec(self, charset: _Encoding) -> Optional[str]:
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    #: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    #:
    #: :meta hide-value:
    MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
        b"\x80": ("euro", "20AC"),
        b"\x81": " ",
        b"\x82": ("sbquo", "201A"),
        b"\x83": ("fnof", "192"),
        b"\x84": ("bdquo", "201E"),
        b"\x85": ("hellip", "2026"),
        b"\x86": ("dagger", "2020"),
        b"\x87": ("Dagger", "2021"),
        b"\x88": ("circ", "2C6"),
        b"\x89": ("permil", "2030"),
        b"\x8a": ("Scaron", "160"),
        b"\x8b": ("lsaquo", "2039"),
        b"\x8c": ("OElig", "152"),
        b"\x8d": "?",
        b"\x8e": ("#x17D", "17D"),
        b"\x8f": "?",
        b"\x90": "?",
        b"\x91": ("lsquo", "2018"),
        b"\x92": ("rsquo", "2019"),
        b"\x93": ("ldquo", "201C"),
        b"\x94": ("rdquo", "201D"),
        b"\x95": ("bull", "2022"),
        b"\x96": ("ndash", "2013"),
        b"\x97": ("mdash", "2014"),
        b"\x98": ("tilde", "2DC"),
        b"\x99": ("trade", "2122"),
        b"\x9a": ("scaron", "161"),
        b"\x9b": ("rsaquo", "203A"),
        b"\x9c": ("oelig", "153"),
        b"\x9d": "?",
        b"\x9e": ("#x17E", "17E"),
        b"\x9f": ("Yuml", ""),
    }

    #: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    #: horrors like stripping diacritical marks to turn á into a, but also
    #: contains non-horrors like turning “ into ".
    #:
    #: Seriously, don't use this for anything other than removing smart
    #: quotes.
    #:
    #: :meta private:
    MS_CHARS_TO_ASCII: Dict[bytes, str] = {
        b"\x80": "EUR",
        b"\x81": " ",
        b"\x82": ",",
        b"\x83": "f",
        b"\x84": ",,",
        b"\x85": "...",
        b"\x86": "+",
        b"\x87": "++",
        b"\x88": "^",
        b"\x89": "%",
        b"\x8a": "S",
        b"\x8b": "<",
        b"\x8c": "OE",
        b"\x8d": "?",
        b"\x8e": "Z",
        b"\x8f": "?",
        b"\x90": "?",
        b"\x91": "'",
        b"\x92": "'",
        b"\x93": '"',
        b"\x94": '"',
        b"\x95": "*",
        b"\x96": "-",
        b"\x97": "--",
        b"\x98": "~",
        b"\x99": "(TM)",
        b"\x9a": "s",
        b"\x9b": ">",
        b"\x9c": "oe",
        b"\x9d": "?",
        b"\x9e": "z",
        b"\x9f": "Y",
        b"\xa0": " ",
        b"\xa1": "!",
        b"\xa2": "c",
        b"\xa3": "GBP",
        b"\xa4": "$",  # This approximation is especially parochial--this is the
        # generic currency symbol.
        b"\xa5": "YEN",
        b"\xa6": "|",
        b"\xa7": "S",
        b"\xa8": "..",
        b"\xa9": "",
        b"\xaa": "(th)",
        b"\xab": "<<",
        b"\xac": "!",
        b"\xad": " ",
        b"\xae": "(R)",
        b"\xaf": "-",
        b"\xb0": "o",
        b"\xb1": "+-",
        b"\xb2": "2",
        b"\xb3": "3",
        b"\xb4": "'",
        b"\xb5": "u",
        b"\xb6": "P",
        b"\xb7": "*",
        b"\xb8": ",",
        b"\xb9": "1",
        b"\xba": "(th)",
        b"\xbb": ">>",
        b"\xbc": "1/4",
        b"\xbd": "1/2",
        b"\xbe": "3/4",
        b"\xbf": "?",
        b"\xc0": "A",
        b"\xc1": "A",
        b"\xc2": "A",
        b"\xc3": "A",
        b"\xc4": "A",
        b"\xc5": "A",
        b"\xc6": "AE",
        b"\xc7": "C",
        b"\xc8": "E",
        b"\xc9": "E",
        b"\xca": "E",
        b"\xcb": "E",
        b"\xcc": "I",
        b"\xcd": "I",
        b"\xce": "I",
        b"\xcf": "I",
        b"\xd0": "D",
        b"\xd1": "N",
        b"\xd2": "O",
        b"\xd3": "O",
        b"\xd4": "O",
        b"\xd5": "O",
        b"\xd6": "O",
        b"\xd7": "*",
        b"\xd8": "O",
        b"\xd9": "U",
        b"\xda": "U",
        b"\xdb": "U",
        b"\xdc": "U",
        b"\xdd": "Y",
        b"\xde": "b",
        b"\xdf": "B",
        b"\xe0": "a",
        b"\xe1": "a",
        b"\xe2": "a",
        b"\xe3": "a",
        b"\xe4": "a",
        b"\xe5": "a",
        b"\xe6": "ae",
        b"\xe7": "c",
        b"\xe8": "e",
        b"\xe9": "e",
        b"\xea": "e",
        b"\xeb": "e",
        b"\xec": "i",
        b"\xed": "i",
        b"\xee": "i",
        b"\xef": "i",
        b"\xf0": "o",
        b"\xf1": "n",
        b"\xf2": "o",
        b"\xf3": "o",
        b"\xf4": "o",
        b"\xf5": "o",
        b"\xf6": "o",
        b"\xf7": "/",
        b"\xf8": "o",
        b"\xf9": "u",
        b"\xfa": "u",
        b"\xfb": "u",
        b"\xfc": "u",
        b"\xfd": "y",
        b"\xfe": "b",
        b"\xff": "y",
    }

    #: A map used when removing rogue Windows-1252/ISO-8859-1
    #: characters in otherwise UTF-8 documents.
    #:
    #: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
    #: Windows-1252.
    #:
    #: :meta hide-value:
    WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
        0x80: b"\xe2\x82\xac",  # €
        0x82: b"\xe2\x80\x9a",  # ‚
        0x83: b"\xc6\x92",  # ƒ
        0x84: b"\xe2\x80\x9e",  # „
        0x85: b"\xe2\x80\xa6",  # …
        0x86: b"\xe2\x80\xa0",  # †
        0x87: b"\xe2\x80\xa1",  # ‡
        0x88: b"\xcb\x86",  # ˆ
        0x89: b"\xe2\x80\xb0",  # ‰
        0x8A: b"\xc5\xa0",  # Š
        0x8B: b"\xe2\x80\xb9",  # ‹
        0x8C: b"\xc5\x92",  # Œ
        0x8E: b"\xc5\xbd",  # Ž
        0x91: b"\xe2\x80\x98",  # ‘
        0x92: b"\xe2\x80\x99",  # ’
        0x93: b"\xe2\x80\x9c",  # “
        0x94: b"\xe2\x80\x9d",  # ”
        0x95: b"\xe2\x80\xa2",  # •
        0x96: b"\xe2\x80\x93",  # –
        0x97: b"\xe2\x80\x94",  # —
        0x98: b"\xcb\x9c",  # ˜
        0x99: b"\xe2\x84\xa2",  # ™
        0x9A: b"\xc5\xa1",  # š
        0x9B: b"\xe2\x80\xba",  # ›
        0x9C: b"\xc5\x93",  # œ
        0x9E: b"\xc5\xbe",  # ž
        0x9F: b"\xc5\xb8",  # Ÿ
        0xA0: b"\xc2\xa0",  #
        0xA1: b"\xc2\xa1",  # ¡
        0xA2: b"\xc2\xa2",  # ¢
        0xA3: b"\xc2\xa3",  # £
        0xA4: b"\xc2\xa4",  # ¤
        0xA5: b"\xc2\xa5",  # ¥
        0xA6: b"\xc2\xa6",  # ¦
        0xA7: b"\xc2\xa7",  # §
        0xA8: b"\xc2\xa8",  # ¨
        0xA9: b"\xc2\xa9",  # ©
        0xAA: b"\xc2\xaa",  # ª
        0xAB: b"\xc2\xab",  # «
        0xAC: b"\xc2\xac",  # ¬
        0xAD: b"\xc2\xad",  #
        0xAE: b"\xc2\xae",  # ®
        0xAF: b"\xc2\xaf",  # ¯
        0xB0: b"\xc2\xb0",  # °
        0xB1: b"\xc2\xb1",  # ±
        0xB2: b"\xc2\xb2",  # ²
        0xB3: b"\xc2\xb3",  # ³
        0xB4: b"\xc2\xb4",  # ´
        0xB5: b"\xc2\xb5",  # µ
        0xB6: b"\xc2\xb6",  # ¶
        0xB7: b"\xc2\xb7",  # ·
        0xB8: b"\xc2\xb8",  # ¸
        0xB9: b"\xc2\xb9",  # ¹
        0xBA: b"\xc2\xba",  # º
        0xBB: b"\xc2\xbb",  # »
        0xBC: b"\xc2\xbc",  # ¼
        0xBD: b"\xc2\xbd",  # ½
        0xBE: b"\xc2\xbe",  # ¾
        0xBF: b"\xc2\xbf",  # ¿
        0xC0: b"\xc3\x80",  # À
        0xC1: b"\xc3\x81",  # Á
        0xC2: b"\xc3\x82",  # Â
        0xC3: b"\xc3\x83",  # Ã
        0xC4: b"\xc3\x84",  # Ä
        0xC5: b"\xc3\x85",  # Å
        0xC6: b"\xc3\x86",  # Æ
        0xC7: b"\xc3\x87",  # Ç
        0xC8: b"\xc3\x88",  # È
        0xC9: b"\xc3\x89",  # É
        0xCA: b"\xc3\x8a",  # Ê
        0xCB: b"\xc3\x8b",  # Ë
        0xCC: b"\xc3\x8c",  # Ì
        0xCD: b"\xc3\x8d",  # Í
        0xCE: b"\xc3\x8e",  # Î
        0xCF: b"\xc3\x8f",  # Ï
        0xD0: b"\xc3\x90",  # Ð
        0xD1: b"\xc3\x91",  # Ñ
        0xD2: b"\xc3\x92",  # Ò
        0xD3: b"\xc3\x93",  # Ó
        0xD4: b"\xc3\x94",  # Ô
        0xD5: b"\xc3\x95",  # Õ
        0xD6: b"\xc3\x96",  # Ö
        0xD7: b"\xc3\x97",  # ×
        0xD8: b"\xc3\x98",  # Ø
        0xD9: b"\xc3\x99",  # Ù
        0xDA: b"\xc3\x9a",  # Ú
        0xDB: b"\xc3\x9b",  # Û
        0xDC: b"\xc3\x9c",  # Ü
        0xDD: b"\xc3\x9d",  # Ý
        0xDE: b"\xc3\x9e",  # Þ
        0xDF: b"\xc3\x9f",  # ß
        0xE0: b"\xc3\xa0",  # à
        0xE1: b"\xc3\xa1",  # á
        0xE2: b"\xc3\xa2",  # â
        0xE3: b"\xc3\xa3",  # ã
        0xE4: b"\xc3\xa4",  # ä
        0xE5: b"\xc3\xa5",  # å
        0xE6: b"\xc3\xa6",  # æ
        0xE7: b"\xc3\xa7",  # ç
        0xE8: b"\xc3\xa8",  # è
        0xE9: b"\xc3\xa9",  # é
        0xEA: b"\xc3\xaa",  # ê
        0xEB: b"\xc3\xab",  # ë
        0xEC: b"\xc3\xac",  # ì
        0xED: b"\xc3\xad",  # í
        0xEE: b"\xc3\xae",  # î
        0xEF: b"\xc3\xaf",  # ï
        0xF0: b"\xc3\xb0",  # ð
        0xF1: b"\xc3\xb1",  # ñ
        0xF2: b"\xc3\xb2",  # ò
        0xF3: b"\xc3\xb3",  # ó
        0xF4: b"\xc3\xb4",  # ô
        0xF5: b"\xc3\xb5",  # õ
        0xF6: b"\xc3\xb6",  # ö
        0xF7: b"\xc3\xb7",  # ÷
        0xF8: b"\xc3\xb8",  # ø
        0xF9: b"\xc3\xb9",  # ù
        0xFA: b"\xc3\xba",  # ú
        0xFB: b"\xc3\xbb",  # û
        0xFC: b"\xc3\xbc",  # ü
        0xFD: b"\xc3\xbd",  # ý
        0xFE: b"\xc3\xbe",  # þ
    }

    #: :meta private:
    MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
        (0xC2, 0xDF, 2),  # 2-byte characters start with a byte C2-DF
        (0xE0, 0xEF, 3),  # 3-byte characters start with E0-EF
        (0xF0, 0xF4, 4),  # 4-byte characters start with F0-F4
    ]

    #: :meta private:
    FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

    #: :meta private:
    LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(
        cls,
        in_bytes: bytes,
        main_encoding: _Encoding = "utf8",
        embedded_encoding: _Encoding = "windows-1252",
    ) -> bytes:
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this *must*
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of ``in_bytes``.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring similar to ``in_bytes``, in which
            ``embedded_encoding`` characters have been converted to
            their ``main_encoding`` equivalents.
        """
        if embedded_encoding.replace("_", "-").lower() not in (
            "windows-1252",
            "windows_1252",
        ):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings."
            )

        if main_encoding.lower() not in ("utf8", "utf-8"):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding."
            )

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b"".join(byte_chunks)
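
    # detwingle in action, for illustration (expected result): Windows-1252
    # smart quotes embedded in otherwise UTF-8 bytes are converted to their
    # UTF-8 equivalents, so the whole document can then be decoded as UTF-8.
    #
    #   >>> snowmen = "\N{SNOWMAN}" * 3
    #   >>> quote = "\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}"
    #   >>> doc = snowmen.encode("utf8") + quote.encode("windows_1252")
    #   >>> UnicodeDammit.detwingle(doc).decode("utf8")
    #   '☃☃☃“I like snowmen!”'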