Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/dammit.py: 35%
306 statements
coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job.
8"""
9# Use of this source code is governed by the MIT license.
10__license__ = "MIT"
12from html.entities import codepoint2name
13from collections import defaultdict
14import codecs
15import re
16import logging
17import string
19# Import a library to autodetect character encodings. We'll support
20# any of a number of libraries that all support the same API:
21#
22# * cchardet
23# * chardet
24# * charset-normalizer
25chardet_module = None
26try:
27 # PyPI package: cchardet
28 import cchardet as chardet_module
29except ImportError:
30 try:
31 # Debian package: python-chardet
32 # PyPI package: chardet
33 import chardet as chardet_module
34 except ImportError:
35 try:
36 # PyPI package: charset-normalizer
37 import charset_normalizer as chardet_module
38 except ImportError:
39 # No chardet available.
40 chardet_module = None
42if chardet_module:
43 def chardet_dammit(s):
44 if isinstance(s, str):
45 return None
46 return chardet_module.detect(s)['encoding']
47else:
48 def chardet_dammit(s):
49 return None
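# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# chardet_dammit() is the single entry point the rest of this module uses for
# third-party detection. It returns None for str input (already Unicode) or
# when no detector library is installed; otherwise it returns the detector's
# best guess, e.g. (exact result depends on which library is installed):
#
#     chardet_dammit(b'caf\xc3\xa9 au lait')   # -> 'utf-8' (or a similar guess)
#     chardet_dammit('already unicode')        # -> None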
51# Build bytestring and Unicode versions of regular expressions for finding
52# a declared encoding inside an XML or HTML document.
53xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
54html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
55encoding_res = dict()
56encoding_res[bytes] = {
57 'html' : re.compile(html_meta.encode("ascii"), re.I),
58 'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
59}
60encoding_res[str] = {
61 'html' : re.compile(html_meta, re.I),
62 'xml' : re.compile(xml_encoding, re.I)
63}
65from html.entities import html5
67class EntitySubstitution(object):
68 """The ability to substitute XML or HTML entities for certain characters."""
70 def _populate_class_variables():
71 """Initialize variables used by this class to manage the plethora of
72 HTML5 named entities.
74 This function returns a 3-tuple containing two dictionaries
75 and a regular expression:
77 unicode_to_name - A mapping of Unicode strings like "⦨" to
78 entity names like "angmsdaa". When a single Unicode string has
79 multiple entity names, we try to choose the most commonly-used
80 name.
82 name_to_unicode: A mapping of entity names like "angmsdaa" to
83 Unicode strings like "⦨".
85 named_entity_re: A regular expression matching (almost) any
86 Unicode string that corresponds to an HTML5 named entity.
87 """
88 unicode_to_name = {}
89 name_to_unicode = {}
91 short_entities = set()
92 long_entities_by_first_character = defaultdict(set)
94 for name_with_semicolon, character in sorted(html5.items()):
95 # "It is intentional, for legacy compatibility, that many
96 # code points have multiple character reference names. For
97 # example, some appear both with and without the trailing
98 # semicolon, or with different capitalizations."
99 # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
100 #
101 # The parsers are in charge of handling (or not) character
102 # references with no trailing semicolon, so we remove the
103 # semicolon whenever it appears.
104 if name_with_semicolon.endswith(';'):
105 name = name_with_semicolon[:-1]
106 else:
107 name = name_with_semicolon
109 # When parsing HTML, we want to recognize any known named
110 # entity and convert it to a sequence of Unicode
111 # characters.
112 if name not in name_to_unicode:
113 name_to_unicode[name] = character
115 # When _generating_ HTML, we want to recognize special
116 # character sequences that _could_ be converted to named
117 # entities.
118 unicode_to_name[character] = name
120 # We also need to build a regular expression that lets us
121 # _find_ those characters in output strings so we can
122 # replace them.
123 #
124 # This is tricky, for two reasons.
126 if (len(character) == 1 and ord(character) < 128
127 and character not in '<>&'):
128 # First, it would be annoying to turn single ASCII
129 # characters like "|" into named entities like
130 # "&verbar;". The exceptions are <>&, which we _must_
131 # turn into named entities to produce valid HTML.
132 continue
134 if len(character) > 1 and all(ord(x) < 128 for x in character):
135 # We also do not want to turn _combinations_ of ASCII
136 # characters like 'fj' into named entities like '&fjlig;',
137 # though that's more debatable.
138 continue
140 # Second, some named entities have a Unicode value that's
141 # a subset of the Unicode value for some _other_ named
142 # entity. As an example, '\u2267' is &GreaterFullEqual;,
143 # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
144 # expression needs to match the first two characters of
145 # "\u2267\u0338foo", but only the first character of
146 # "\u2267foo".
147 #
148 # In this step, we build two sets of characters that
149 # _eventually_ need to go into the regular expression. But
150 # we won't know exactly what the regular expression needs
151 # to look like until we've gone through the entire list of
152 # named entities.
153 if len(character) == 1:
154 short_entities.add(character)
155 else:
156 long_entities_by_first_character[character[0]].add(character)
158 # Now that we've been through the entire list of entities, we
159 # can create a regular expression that matches any of them.
160 particles = set()
161 for short in short_entities:
162 long_versions = long_entities_by_first_character[short]
163 if not long_versions:
164 particles.add(short)
165 else:
166 ignore = "".join([x[1] for x in long_versions])
167 # This finds, e.g. \u2267 but only if it is _not_
168 # followed by \u0338.
169 particles.add("%s(?![%s])" % (short, ignore))
171 for long_entities in list(long_entities_by_first_character.values()):
172 for long_entity in long_entities:
173 particles.add(long_entity)
175 re_definition = "(%s)" % "|".join(particles)
177 # If an entity shows up in both html5 and codepoint2name, it's
178 # likely that HTML5 gives it several different names, such as
179 # 'rsquo' and 'rsquor'. When converting Unicode characters to
180 # named entities, the codepoint2name name should take
181 # precedence where possible, since that's the more easily
182 # recognizable one.
183 for codepoint, name in list(codepoint2name.items()):
184 character = chr(codepoint)
185 unicode_to_name[character] = name
187 return unicode_to_name, name_to_unicode, re.compile(re_definition)
188 (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
189 CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
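# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# The class variables built above support lookups in both directions, e.g.:
#
#     EntitySubstitution.CHARACTER_TO_HTML_ENTITY['\u2014']   # -> 'mdash'
#     EntitySubstitution.HTML_ENTITY_TO_CHARACTER['mdash']    # -> '\u2014' (an em dash)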
191 CHARACTER_TO_XML_ENTITY = {
192 "'": "apos",
193 '"': "quot",
194 "&": "amp",
195 "<": "lt",
196 ">": "gt",
197 }
199 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
200 "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
201 ")")
203 AMPERSAND_OR_BRACKET = re.compile("([<>&])")
205 @classmethod
206 def _substitute_html_entity(cls, matchobj):
207 """Used with a regular expression to substitute the
208 appropriate HTML entity for a special character string."""
209 entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
210 return "&%s;" % entity
212 @classmethod
213 def _substitute_xml_entity(cls, matchobj):
214 """Used with a regular expression to substitute the
215 appropriate XML entity for a special character string."""
216 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
217 return "&%s;" % entity
219 @classmethod
220 def quoted_attribute_value(cls, value):
221 """Make a value into a quoted XML attribute, possibly escaping it.
223 Most strings will be quoted using double quotes.
225 Bob's Bar -> "Bob's Bar"
227 If a string contains double quotes, it will be quoted using
228 single quotes.
230 Welcome to "my bar" -> 'Welcome to "my bar"'
232 If a string contains both single and double quotes, the
233 double quotes will be escaped, and the string will be quoted
234 using double quotes.
236 Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
237 """
238 quote_with = '"'
239 if '"' in value:
240 if "'" in value:
241 # The string contains both single and double
242 # quotes. Turn the double quotes into
243 # entities. We quote the double quotes rather than
244 # the single quotes because the entity name is
245 # "&quot;" whether this is HTML or XML. If we
246 # quoted the single quotes, we'd have to decide
247 # between &apos; and &squot;.
248 replace_with = "&quot;"
249 value = value.replace('"', replace_with)
250 else:
251 # There are double quotes but no single quotes.
252 # We can use single quotes to quote the attribute.
253 quote_with = "'"
254 return quote_with + value + quote_with
256 @classmethod
257 def substitute_xml(cls, value, make_quoted_attribute=False):
258 """Substitute XML entities for special XML characters.
260 :param value: A string to be substituted. The less-than sign
261 will become &lt;, the greater-than sign will become &gt;,
262 and any ampersands will become &amp;. If you want ampersands
263 that appear to be part of an entity definition to be left
264 alone, use substitute_xml_containing_entities() instead.
266 :param make_quoted_attribute: If True, then the string will be
267 quoted, as befits an attribute value.
268 """
269 # Escape angle brackets and ampersands.
270 value = cls.AMPERSAND_OR_BRACKET.sub(
271 cls._substitute_xml_entity, value)
273 if make_quoted_attribute:
274 value = cls.quoted_attribute_value(value)
275 return value
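# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# substitute_xml() escapes every <, > and &, even ampersands that already
# look like entities, e.g.:
#
#     EntitySubstitution.substitute_xml('AT&T <guide>')
#     # -> 'AT&amp;T &lt;guide&gt;'
#     EntitySubstitution.substitute_xml('a < b', make_quoted_attribute=True)
#     # -> '"a &lt; b"'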
277 @classmethod
278 def substitute_xml_containing_entities(
279 cls, value, make_quoted_attribute=False):
280 """Substitute XML entities for special XML characters.
282 :param value: A string to be substituted. The less-than sign will
283 become &lt;, the greater-than sign will become &gt;, and any
284 ampersands that are not part of an entity definition will
285 become &amp;.
287 :param make_quoted_attribute: If True, then the string will be
288 quoted, as befits an attribute value.
289 """
290 # Escape angle brackets, and ampersands that aren't part of
291 # entities.
292 value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
293 cls._substitute_xml_entity, value)
295 if make_quoted_attribute:
296 value = cls.quoted_attribute_value(value)
297 return value
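# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Unlike substitute_xml(), this variant leaves ampersands alone when they
# already introduce an entity, e.g.:
#
#     EntitySubstitution.substitute_xml_containing_entities('&lt; stays, & is escaped')
#     # -> '&lt; stays, &amp; is escaped'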
299 @classmethod
300 def substitute_html(cls, s):
301 """Replace certain Unicode characters with named HTML entities.
303 This differs from data.encode(encoding, 'xmlcharrefreplace')
304 in that the goal is to make the result more readable (to those
305 with ASCII displays) rather than to recover from
306 errors. There's absolutely nothing wrong with a UTF-8 string
307 containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
308 character with "&eacute;" will make it more readable to some
309 people.
311 :param s: A Unicode string.
312 """
313 return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
314 cls._substitute_html_entity, s)
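# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# substitute_html() replaces characters that have named entities but leaves
# plain ASCII alone, e.g.:
#
#     EntitySubstitution.substitute_html('caf\u00e9 \u2014 50\u00a2')
#     # -> 'caf&eacute; &mdash; 50&cent;'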
317class EncodingDetector:
318 """Suggests a number of possible encodings for a bytestring.
320 Order of precedence:
322 1. Encodings you specifically tell EncodingDetector to try first
323 (the known_definite_encodings argument to the constructor).
325 2. An encoding determined by sniffing the document's byte-order mark.
327 3. Encodings you specifically tell EncodingDetector to try if
328 byte-order mark sniffing fails (the user_encodings argument to the
329 constructor).
331 4. An encoding declared within the bytestring itself, either in an
332 XML declaration (if the bytestring is to be interpreted as an XML
333 document), or in a <meta> tag (if the bytestring is to be
334 interpreted as an HTML document.)
336 5. An encoding detected through textual analysis by chardet,
337 cchardet, or a similar external library.
339 6. UTF-8.
341 7. Windows-1252.
343 """
344 def __init__(self, markup, known_definite_encodings=None,
345 is_html=False, exclude_encodings=None,
346 user_encodings=None, override_encodings=None):
347 """Constructor.
349 :param markup: Some markup in an unknown encoding.
351 :param known_definite_encodings: When determining the encoding
352 of `markup`, these encodings will be tried first, in
353 order. In HTML terms, this corresponds to the "known
354 definite encoding" step defined here:
355 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
357 :param user_encodings: These encodings will be tried after the
358 `known_definite_encodings` have been tried and failed, and
359 after an attempt to sniff the encoding by looking at a
360 byte order mark has failed. In HTML terms, this
361 corresponds to the step "user has explicitly instructed
362 the user agent to override the document's character
363 encoding", defined here:
364 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
366 :param override_encodings: A deprecated alias for
367 known_definite_encodings. Any encodings here will be tried
368 immediately after the encodings in
369 known_definite_encodings.
371 :param is_html: If True, this markup is considered to be
372 HTML. Otherwise it's assumed to be XML.
374 :param exclude_encodings: These encodings will not be tried,
375 even if they otherwise would be.
377 """
378 self.known_definite_encodings = list(known_definite_encodings or [])
379 if override_encodings:
380 self.known_definite_encodings += override_encodings
381 self.user_encodings = user_encodings or []
382 exclude_encodings = exclude_encodings or []
383 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
384 self.chardet_encoding = None
385 self.is_html = is_html
386 self.declared_encoding = None
388 # First order of business: strip a byte-order mark.
389 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
391 def _usable(self, encoding, tried):
392 """Should we even bother to try this encoding?
394 :param encoding: Name of an encoding.
395 :param tried: Encodings that have already been tried. This will be modified
396 as a side effect.
397 """
398 if encoding is not None:
399 encoding = encoding.lower()
400 if encoding in self.exclude_encodings:
401 return False
402 if encoding not in tried:
403 tried.add(encoding)
404 return True
405 return False
407 @property
408 def encodings(self):
409 """Yield a number of encodings that might work for this markup.
411 :yield: A sequence of strings.
412 """
413 tried = set()
415 # First, try the known definite encodings
416 for e in self.known_definite_encodings:
417 if self._usable(e, tried):
418 yield e
420 # Did the document originally start with a byte-order mark
421 # that indicated its encoding?
422 if self._usable(self.sniffed_encoding, tried):
423 yield self.sniffed_encoding
425 # Sniffing the byte-order mark did nothing; try the user
426 # encodings.
427 for e in self.user_encodings:
428 if self._usable(e, tried):
429 yield e
431 # Look within the document for an XML or HTML encoding
432 # declaration.
433 if self.declared_encoding is None:
434 self.declared_encoding = self.find_declared_encoding(
435 self.markup, self.is_html)
436 if self._usable(self.declared_encoding, tried):
437 yield self.declared_encoding
439 # Use third-party character set detection to guess at the
440 # encoding.
441 if self.chardet_encoding is None:
442 self.chardet_encoding = chardet_dammit(self.markup)
443 if self._usable(self.chardet_encoding, tried):
444 yield self.chardet_encoding
446 # As a last-ditch effort, try utf-8 and windows-1252.
447 for e in ('utf-8', 'windows-1252'):
448 if self._usable(e, tried):
449 yield e
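# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Typical use is to walk the generator until a decode succeeds, e.g.:
#
#     detector = EncodingDetector(b'\xef\xbb\xbf<html>...</html>', is_html=True)
#     for encoding in detector.encodings:
#         # the UTF-8 byte-order mark above makes 'utf-8' an early candidate;
#         # try decoding here and stop at the first encoding that works.
#         ...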
451 @classmethod
452 def strip_byte_order_mark(cls, data):
453 """If a byte-order mark is present, strip it and return the encoding it implies.
455 :param data: Some markup.
456 :return: A 2-tuple (modified data, implied encoding)
457 """
458 encoding = None
459 if isinstance(data, str):
460 # Unicode data cannot have a byte-order mark.
461 return data, encoding
462 if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
463 and (data[2:4] != b'\x00\x00'):
464 encoding = 'utf-16be'
465 data = data[2:]
466 elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
467 and (data[2:4] != b'\x00\x00'):
468 encoding = 'utf-16le'
469 data = data[2:]
470 elif data[:3] == b'\xef\xbb\xbf':
471 encoding = 'utf-8'
472 data = data[3:]
473 elif data[:4] == b'\x00\x00\xfe\xff':
474 encoding = 'utf-32be'
475 data = data[4:]
476 elif data[:4] == b'\xff\xfe\x00\x00':
477 encoding = 'utf-32le'
478 data = data[4:]
479 return data, encoding
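# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# The BOM, if any, is removed and its implied encoding reported, e.g.:
#
#     EncodingDetector.strip_byte_order_mark(b'\xef\xbb\xbfhello')
#     # -> (b'hello', 'utf-8')
#     EncodingDetector.strip_byte_order_mark(b'hello')
#     # -> (b'hello', None)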
481 @classmethod
482 def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
483 """Given a document, tries to find its declared encoding.
485 An XML encoding is declared at the beginning of the document.
487 An HTML encoding is declared in a <meta> tag, hopefully near the
488 beginning of the document.
490 :param markup: Some markup.
491 :param is_html: If True, this markup is considered to be HTML. Otherwise
492 it's assumed to be XML.
493 :param search_entire_document: Since an encoding is supposed to be declared near the beginning
494 of the document, most of the time it's only necessary to search a few kilobytes of data.
495 Set this to True to force this method to search the entire document.
496 """
497 if search_entire_document:
498 xml_endpos = html_endpos = len(markup)
499 else:
500 xml_endpos = 1024
501 html_endpos = max(2048, int(len(markup) * 0.05))
503 if isinstance(markup, bytes):
504 res = encoding_res[bytes]
505 else:
506 res = encoding_res[str]
508 xml_re = res['xml']
509 html_re = res['html']
510 declared_encoding = None
511 declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
512 if not declared_encoding_match and is_html:
513 declared_encoding_match = html_re.search(markup, endpos=html_endpos)
514 if declared_encoding_match is not None:
515 declared_encoding = declared_encoding_match.groups()[0]
516 if declared_encoding:
517 if isinstance(declared_encoding, bytes):
518 declared_encoding = declared_encoding.decode('ascii', 'replace')
519 return declared_encoding.lower()
520 return None
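# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Declared encodings are reported lowercased, e.g.:
#
#     EncodingDetector.find_declared_encoding(
#         b'<?xml version="1.0" encoding="ISO-8859-1"?><doc/>')
#     # -> 'iso-8859-1'
#     EncodingDetector.find_declared_encoding(b'<meta charset="utf-8">', is_html=True)
#     # -> 'utf-8'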
522class UnicodeDammit:
523 """A class for detecting the encoding of a *ML document and
524 converting it to a Unicode string. If the source encoding is
525 windows-1252, can replace MS smart quotes with their HTML or XML
526 equivalents."""
528 # This dictionary maps commonly seen values for "charset" in HTML
529 # meta tags to the corresponding Python codec names. It only covers
530 # values that aren't in Python's aliases and can't be determined
531 # by the heuristics in find_codec.
532 CHARSET_ALIASES = {"macintosh": "mac-roman",
533 "x-sjis": "shift-jis"}
535 ENCODINGS_WITH_SMART_QUOTES = [
536 "windows-1252",
537 "iso-8859-1",
538 "iso-8859-2",
539 ]
541 def __init__(self, markup, known_definite_encodings=[],
542 smart_quotes_to=None, is_html=False, exclude_encodings=[],
543 user_encodings=None, override_encodings=None
544 ):
545 """Constructor.
547 :param markup: A bytestring representing markup in an unknown encoding.
549 :param known_definite_encodings: When determining the encoding
550 of `markup`, these encodings will be tried first, in
551 order. In HTML terms, this corresponds to the "known
552 definite encoding" step defined here:
553 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
555 :param user_encodings: These encodings will be tried after the
556 `known_definite_encodings` have been tried and failed, and
557 after an attempt to sniff the encoding by looking at a
558 byte order mark has failed. In HTML terms, this
559 corresponds to the step "user has explicitly instructed
560 the user agent to override the document's character
561 encoding", defined here:
562 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
564 :param override_encodings: A deprecated alias for
565 known_definite_encodings. Any encodings here will be tried
566 immediately after the encodings in
567 known_definite_encodings.
569 :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
570 to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
571 Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
572 will convert them to HTML entity references.
573 :param is_html: If True, this markup is considered to be HTML. Otherwise
574 it's assumed to be XML.
575 :param exclude_encodings: These encodings will not be considered, even
576 if the sniffing code thinks they might make sense.
578 """
579 self.smart_quotes_to = smart_quotes_to
580 self.tried_encodings = []
581 self.contains_replacement_characters = False
582 self.is_html = is_html
583 self.log = logging.getLogger(__name__)
584 self.detector = EncodingDetector(
585 markup, known_definite_encodings, is_html, exclude_encodings,
586 user_encodings, override_encodings
587 )
589 # Short-circuit if the data is in Unicode to begin with.
590 if isinstance(markup, str) or markup == '':
591 self.markup = markup
592 self.unicode_markup = str(markup)
593 self.original_encoding = None
594 return
596 # The encoding detector may have stripped a byte-order mark.
597 # Use the stripped markup from this point on.
598 self.markup = self.detector.markup
600 u = None
601 for encoding in self.detector.encodings:
602 markup = self.detector.markup
603 u = self._convert_from(encoding)
604 if u is not None:
605 break
607 if not u:
608 # None of the encodings worked. As an absolute last resort,
609 # try them again with character replacement.
611 for encoding in self.detector.encodings:
612 if encoding != "ascii":
613 u = self._convert_from(encoding, "replace")
614 if u is not None:
615 self.log.warning(
616 "Some characters could not be decoded, and were "
617 "replaced with REPLACEMENT CHARACTER."
618 )
619 self.contains_replacement_characters = True
620 break
622 # If none of that worked, we could at this point force it to
623 # ASCII, but that would destroy so much data that I think
624 # giving up is better.
625 self.unicode_markup = u
626 if not u:
627 self.original_encoding = None
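# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# A minimal UnicodeDammit round trip; the winning encoding depends on which
# detector library, if any, is installed:
#
#     dammit = UnicodeDammit(b'<p>caf\xe9</p>', is_html=True)
#     dammit.unicode_markup       # -> '<p>café</p>' (if a Latin-1-family guess wins)
#     dammit.original_encoding    # -> e.g. 'windows-1252'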
629 def _sub_ms_char(self, match):
630 """Changes a MS smart quote character to an XML or HTML
631 entity, or an ASCII character."""
632 orig = match.group(1)
633 if self.smart_quotes_to == 'ascii':
634 sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
635 else:
636 sub = self.MS_CHARS.get(orig)
637 if type(sub) == tuple:
638 if self.smart_quotes_to == 'xml':
639 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
640 else:
641 sub = '&'.encode() + sub[0].encode() + ';'.encode()
642 else:
643 sub = sub.encode()
644 return sub
646 def _convert_from(self, proposed, errors="strict"):
647 """Attempt to convert the markup to the proposed encoding.
649 :param proposed: The name of a character encoding.
650 """
651 proposed = self.find_codec(proposed)
652 if not proposed or (proposed, errors) in self.tried_encodings:
653 return None
654 self.tried_encodings.append((proposed, errors))
655 markup = self.markup
656 # Convert smart quotes to HTML if coming from an encoding
657 # that might have them.
658 if (self.smart_quotes_to is not None
659 and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
660 smart_quotes_re = b"([\x80-\x9f])"
661 smart_quotes_compiled = re.compile(smart_quotes_re)
662 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
664 try:
665 #print("Trying to convert document to %s (errors=%s)" % (
666 # proposed, errors))
667 u = self._to_unicode(markup, proposed, errors)
668 self.markup = u
669 self.original_encoding = proposed
670 except Exception as e:
671 #print("That didn't work!")
672 #print(e)
673 return None
674 #print("Correct encoding: %s" % proposed)
675 return self.markup
677 def _to_unicode(self, data, encoding, errors="strict"):
678 """Given a string and its encoding, decodes the string into Unicode.
680 :param encoding: The name of an encoding.
681 """
682 return str(data, encoding, errors)
684 @property
685 def declared_html_encoding(self):
686 """If the markup is an HTML document, returns the encoding declared _within_
687 the document.
688 """
689 if not self.is_html:
690 return None
691 return self.detector.declared_encoding
693 def find_codec(self, charset):
694 """Convert the name of a character set to a codec name.
696 :param charset: The name of a character set.
697 :return: The name of a codec.
698 """
699 value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
700 or (charset and self._codec(charset.replace("-", "")))
701 or (charset and self._codec(charset.replace("-", "_")))
702 or (charset and charset.lower())
703 or charset
704 )
705 if value:
706 return value.lower()
707 return None
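# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Given a UnicodeDammit instance `dammit`, CHARSET_ALIASES and the
# hyphen/underscore rewrites let sloppy charset names resolve to real codecs:
#
#     dammit.find_codec('macintosh')   # -> 'mac-roman'
#     dammit.find_codec('utf8')        # -> 'utf8'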
709 def _codec(self, charset):
710 if not charset:
711 return charset
712 codec = None
713 try:
714 codecs.lookup(charset)
715 codec = charset
716 except (LookupError, ValueError):
717 pass
718 return codec
721 # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
722 MS_CHARS = {b'\x80': ('euro', '20AC'),
723 b'\x81': ' ',
724 b'\x82': ('sbquo', '201A'),
725 b'\x83': ('fnof', '192'),
726 b'\x84': ('bdquo', '201E'),
727 b'\x85': ('hellip', '2026'),
728 b'\x86': ('dagger', '2020'),
729 b'\x87': ('Dagger', '2021'),
730 b'\x88': ('circ', '2C6'),
731 b'\x89': ('permil', '2030'),
732 b'\x8A': ('Scaron', '160'),
733 b'\x8B': ('lsaquo', '2039'),
734 b'\x8C': ('OElig', '152'),
735 b'\x8D': '?',
736 b'\x8E': ('#x17D', '17D'),
737 b'\x8F': '?',
738 b'\x90': '?',
739 b'\x91': ('lsquo', '2018'),
740 b'\x92': ('rsquo', '2019'),
741 b'\x93': ('ldquo', '201C'),
742 b'\x94': ('rdquo', '201D'),
743 b'\x95': ('bull', '2022'),
744 b'\x96': ('ndash', '2013'),
745 b'\x97': ('mdash', '2014'),
746 b'\x98': ('tilde', '2DC'),
747 b'\x99': ('trade', '2122'),
748 b'\x9a': ('scaron', '161'),
749 b'\x9b': ('rsaquo', '203A'),
750 b'\x9c': ('oelig', '153'),
751 b'\x9d': '?',
752 b'\x9e': ('#x17E', '17E'),
753 b'\x9f': ('Yuml', '178'),}
755 # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
756 # horrors like stripping diacritical marks to turn á into a, but also
757 # contains non-horrors like turning “ into ".
758 MS_CHARS_TO_ASCII = {
759 b'\x80' : 'EUR',
760 b'\x81' : ' ',
761 b'\x82' : ',',
762 b'\x83' : 'f',
763 b'\x84' : ',,',
764 b'\x85' : '...',
765 b'\x86' : '+',
766 b'\x87' : '++',
767 b'\x88' : '^',
768 b'\x89' : '%',
769 b'\x8a' : 'S',
770 b'\x8b' : '<',
771 b'\x8c' : 'OE',
772 b'\x8d' : '?',
773 b'\x8e' : 'Z',
774 b'\x8f' : '?',
775 b'\x90' : '?',
776 b'\x91' : "'",
777 b'\x92' : "'",
778 b'\x93' : '"',
779 b'\x94' : '"',
780 b'\x95' : '*',
781 b'\x96' : '-',
782 b'\x97' : '--',
783 b'\x98' : '~',
784 b'\x99' : '(TM)',
785 b'\x9a' : 's',
786 b'\x9b' : '>',
787 b'\x9c' : 'oe',
788 b'\x9d' : '?',
789 b'\x9e' : 'z',
790 b'\x9f' : 'Y',
791 b'\xa0' : ' ',
792 b'\xa1' : '!',
793 b'\xa2' : 'c',
794 b'\xa3' : 'GBP',
795 b'\xa4' : '$', #This approximation is especially parochial--this is the
796 #generic currency symbol.
797 b'\xa5' : 'YEN',
798 b'\xa6' : '|',
799 b'\xa7' : 'S',
800 b'\xa8' : '..',
801 b'\xa9' : '',
802 b'\xaa' : '(th)',
803 b'\xab' : '<<',
804 b'\xac' : '!',
805 b'\xad' : ' ',
806 b'\xae' : '(R)',
807 b'\xaf' : '-',
808 b'\xb0' : 'o',
809 b'\xb1' : '+-',
810 b'\xb2' : '2',
811 b'\xb3' : '3',
812 b'\xb4' : ("'", 'acute'),
813 b'\xb5' : 'u',
814 b'\xb6' : 'P',
815 b'\xb7' : '*',
816 b'\xb8' : ',',
817 b'\xb9' : '1',
818 b'\xba' : '(th)',
819 b'\xbb' : '>>',
820 b'\xbc' : '1/4',
821 b'\xbd' : '1/2',
822 b'\xbe' : '3/4',
823 b'\xbf' : '?',
824 b'\xc0' : 'A',
825 b'\xc1' : 'A',
826 b'\xc2' : 'A',
827 b'\xc3' : 'A',
828 b'\xc4' : 'A',
829 b'\xc5' : 'A',
830 b'\xc6' : 'AE',
831 b'\xc7' : 'C',
832 b'\xc8' : 'E',
833 b'\xc9' : 'E',
834 b'\xca' : 'E',
835 b'\xcb' : 'E',
836 b'\xcc' : 'I',
837 b'\xcd' : 'I',
838 b'\xce' : 'I',
839 b'\xcf' : 'I',
840 b'\xd0' : 'D',
841 b'\xd1' : 'N',
842 b'\xd2' : 'O',
843 b'\xd3' : 'O',
844 b'\xd4' : 'O',
845 b'\xd5' : 'O',
846 b'\xd6' : 'O',
847 b'\xd7' : '*',
848 b'\xd8' : 'O',
849 b'\xd9' : 'U',
850 b'\xda' : 'U',
851 b'\xdb' : 'U',
852 b'\xdc' : 'U',
853 b'\xdd' : 'Y',
854 b'\xde' : 'b',
855 b'\xdf' : 'B',
856 b'\xe0' : 'a',
857 b'\xe1' : 'a',
858 b'\xe2' : 'a',
859 b'\xe3' : 'a',
860 b'\xe4' : 'a',
861 b'\xe5' : 'a',
862 b'\xe6' : 'ae',
863 b'\xe7' : 'c',
864 b'\xe8' : 'e',
865 b'\xe9' : 'e',
866 b'\xea' : 'e',
867 b'\xeb' : 'e',
868 b'\xec' : 'i',
869 b'\xed' : 'i',
870 b'\xee' : 'i',
871 b'\xef' : 'i',
872 b'\xf0' : 'o',
873 b'\xf1' : 'n',
874 b'\xf2' : 'o',
875 b'\xf3' : 'o',
876 b'\xf4' : 'o',
877 b'\xf5' : 'o',
878 b'\xf6' : 'o',
879 b'\xf7' : '/',
880 b'\xf8' : 'o',
881 b'\xf9' : 'u',
882 b'\xfa' : 'u',
883 b'\xfb' : 'u',
884 b'\xfc' : 'u',
885 b'\xfd' : 'y',
886 b'\xfe' : 'b',
887 b'\xff' : 'y',
888 }
890 # A map used when removing rogue Windows-1252/ISO-8859-1
891 # characters in otherwise UTF-8 documents.
892 #
893 # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
894 # Windows-1252.
895 WINDOWS_1252_TO_UTF8 = {
896 0x80 : b'\xe2\x82\xac', # €
897 0x82 : b'\xe2\x80\x9a', # ‚
898 0x83 : b'\xc6\x92', # ƒ
899 0x84 : b'\xe2\x80\x9e', # „
900 0x85 : b'\xe2\x80\xa6', # …
901 0x86 : b'\xe2\x80\xa0', # †
902 0x87 : b'\xe2\x80\xa1', # ‡
903 0x88 : b'\xcb\x86', # ˆ
904 0x89 : b'\xe2\x80\xb0', # ‰
905 0x8a : b'\xc5\xa0', # Š
906 0x8b : b'\xe2\x80\xb9', # ‹
907 0x8c : b'\xc5\x92', # Œ
908 0x8e : b'\xc5\xbd', # Ž
909 0x91 : b'\xe2\x80\x98', # ‘
910 0x92 : b'\xe2\x80\x99', # ’
911 0x93 : b'\xe2\x80\x9c', # “
912 0x94 : b'\xe2\x80\x9d', # ”
913 0x95 : b'\xe2\x80\xa2', # •
914 0x96 : b'\xe2\x80\x93', # –
915 0x97 : b'\xe2\x80\x94', # —
916 0x98 : b'\xcb\x9c', # ˜
917 0x99 : b'\xe2\x84\xa2', # ™
918 0x9a : b'\xc5\xa1', # š
919 0x9b : b'\xe2\x80\xba', # ›
920 0x9c : b'\xc5\x93', # œ
921 0x9e : b'\xc5\xbe', # ž
922 0x9f : b'\xc5\xb8', # Ÿ
923 0xa0 : b'\xc2\xa0', #
924 0xa1 : b'\xc2\xa1', # ¡
925 0xa2 : b'\xc2\xa2', # ¢
926 0xa3 : b'\xc2\xa3', # £
927 0xa4 : b'\xc2\xa4', # ¤
928 0xa5 : b'\xc2\xa5', # ¥
929 0xa6 : b'\xc2\xa6', # ¦
930 0xa7 : b'\xc2\xa7', # §
931 0xa8 : b'\xc2\xa8', # ¨
932 0xa9 : b'\xc2\xa9', # ©
933 0xaa : b'\xc2\xaa', # ª
934 0xab : b'\xc2\xab', # «
935 0xac : b'\xc2\xac', # ¬
936 0xad : b'\xc2\xad', #
937 0xae : b'\xc2\xae', # ®
938 0xaf : b'\xc2\xaf', # ¯
939 0xb0 : b'\xc2\xb0', # °
940 0xb1 : b'\xc2\xb1', # ±
941 0xb2 : b'\xc2\xb2', # ²
942 0xb3 : b'\xc2\xb3', # ³
943 0xb4 : b'\xc2\xb4', # ´
944 0xb5 : b'\xc2\xb5', # µ
945 0xb6 : b'\xc2\xb6', # ¶
946 0xb7 : b'\xc2\xb7', # ·
947 0xb8 : b'\xc2\xb8', # ¸
948 0xb9 : b'\xc2\xb9', # ¹
949 0xba : b'\xc2\xba', # º
950 0xbb : b'\xc2\xbb', # »
951 0xbc : b'\xc2\xbc', # ¼
952 0xbd : b'\xc2\xbd', # ½
953 0xbe : b'\xc2\xbe', # ¾
954 0xbf : b'\xc2\xbf', # ¿
955 0xc0 : b'\xc3\x80', # À
956 0xc1 : b'\xc3\x81', # Á
957 0xc2 : b'\xc3\x82', # Â
958 0xc3 : b'\xc3\x83', # Ã
959 0xc4 : b'\xc3\x84', # Ä
960 0xc5 : b'\xc3\x85', # Å
961 0xc6 : b'\xc3\x86', # Æ
962 0xc7 : b'\xc3\x87', # Ç
963 0xc8 : b'\xc3\x88', # È
964 0xc9 : b'\xc3\x89', # É
965 0xca : b'\xc3\x8a', # Ê
966 0xcb : b'\xc3\x8b', # Ë
967 0xcc : b'\xc3\x8c', # Ì
968 0xcd : b'\xc3\x8d', # Í
969 0xce : b'\xc3\x8e', # Î
970 0xcf : b'\xc3\x8f', # Ï
971 0xd0 : b'\xc3\x90', # Ð
972 0xd1 : b'\xc3\x91', # Ñ
973 0xd2 : b'\xc3\x92', # Ò
974 0xd3 : b'\xc3\x93', # Ó
975 0xd4 : b'\xc3\x94', # Ô
976 0xd5 : b'\xc3\x95', # Õ
977 0xd6 : b'\xc3\x96', # Ö
978 0xd7 : b'\xc3\x97', # ×
979 0xd8 : b'\xc3\x98', # Ø
980 0xd9 : b'\xc3\x99', # Ù
981 0xda : b'\xc3\x9a', # Ú
982 0xdb : b'\xc3\x9b', # Û
983 0xdc : b'\xc3\x9c', # Ü
984 0xdd : b'\xc3\x9d', # Ý
985 0xde : b'\xc3\x9e', # Þ
986 0xdf : b'\xc3\x9f', # ß
987 0xe0 : b'\xc3\xa0', # à
988 0xe1 : b'\xc3\xa1', # á
989 0xe2 : b'\xc3\xa2', # â
990 0xe3 : b'\xc3\xa3', # ã
991 0xe4 : b'\xc3\xa4', # ä
992 0xe5 : b'\xc3\xa5', # å
993 0xe6 : b'\xc3\xa6', # æ
994 0xe7 : b'\xc3\xa7', # ç
995 0xe8 : b'\xc3\xa8', # è
996 0xe9 : b'\xc3\xa9', # é
997 0xea : b'\xc3\xaa', # ê
998 0xeb : b'\xc3\xab', # ë
999 0xec : b'\xc3\xac', # ì
1000 0xed : b'\xc3\xad', # í
1001 0xee : b'\xc3\xae', # î
1002 0xef : b'\xc3\xaf', # ï
1003 0xf0 : b'\xc3\xb0', # ð
1004 0xf1 : b'\xc3\xb1', # ñ
1005 0xf2 : b'\xc3\xb2', # ò
1006 0xf3 : b'\xc3\xb3', # ó
1007 0xf4 : b'\xc3\xb4', # ô
1008 0xf5 : b'\xc3\xb5', # õ
1009 0xf6 : b'\xc3\xb6', # ö
1010 0xf7 : b'\xc3\xb7', # ÷
1011 0xf8 : b'\xc3\xb8', # ø
1012 0xf9 : b'\xc3\xb9', # ù
1013 0xfa : b'\xc3\xba', # ú
1014 0xfb : b'\xc3\xbb', # û
1015 0xfc : b'\xc3\xbc', # ü
1016 0xfd : b'\xc3\xbd', # ý
1017 0xfe : b'\xc3\xbe', # þ
1018 }
1020 MULTIBYTE_MARKERS_AND_SIZES = [
1021 (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
1022 (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
1023 (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
1024 ]
1026 FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
1027 LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
1029 @classmethod
1030 def detwingle(cls, in_bytes, main_encoding="utf8",
1031 embedded_encoding="windows-1252"):
1032 """Fix characters from one encoding embedded in some other encoding.
1034 Currently the only situation supported is Windows-1252 (or its
1035 subset ISO-8859-1), embedded in UTF-8.
1037 :param in_bytes: A bytestring that you suspect contains
1038 characters from multiple encodings. Note that this _must_
1039 be a bytestring. If you've already converted the document
1040 to Unicode, you're too late.
1041 :param main_encoding: The primary encoding of `in_bytes`.
1042 :param embedded_encoding: The encoding that was used to embed characters
1043 in the main document.
1044 :return: A bytestring in which `embedded_encoding`
1045 characters have been converted to their `main_encoding`
1046 equivalents.
1047 """
1048 if embedded_encoding.replace('_', '-').lower() not in (
1049 'windows-1252', 'windows_1252'):
1050 raise NotImplementedError(
1051 "Windows-1252 and ISO-8859-1 are the only currently supported "
1052 "embedded encodings.")
1054 if main_encoding.lower() not in ('utf8', 'utf-8'):
1055 raise NotImplementedError(
1056 "UTF-8 is the only currently supported main encoding.")
1058 byte_chunks = []
1060 chunk_start = 0
1061 pos = 0
1062 while pos < len(in_bytes):
1063 byte = in_bytes[pos]
1064 if not isinstance(byte, int):
1065 # Python 2.x
1066 byte = ord(byte)
1067 if (byte >= cls.FIRST_MULTIBYTE_MARKER
1068 and byte <= cls.LAST_MULTIBYTE_MARKER):
1069 # This is the start of a UTF-8 multibyte character. Skip
1070 # to the end.
1071 for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
1072 if byte >= start and byte <= end:
1073 pos += size
1074 break
1075 elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
1076 # We found a Windows-1252 character!
1077 # Save the string up to this point as a chunk.
1078 byte_chunks.append(in_bytes[chunk_start:pos])
1080 # Now translate the Windows-1252 character into UTF-8
1081 # and add it as another, one-byte chunk.
1082 byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
1083 pos += 1
1084 chunk_start = pos
1085 else:
1086 # Go on to the next character.
1087 pos += 1
1088 if chunk_start == 0:
1089 # The string is unchanged.
1090 return in_bytes
1091 else:
1092 # Store the final chunk.
1093 byte_chunks.append(in_bytes[chunk_start:])
1094 return b''.join(byte_chunks)
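# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# detwingle() rescues Windows-1252 bytes embedded in otherwise valid UTF-8:
#
#     snowmen = ('\N{SNOWMAN}' * 3).encode('utf8')
#     quote = '\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}'.encode('windows-1252')
#     UnicodeDammit.detwingle(snowmen + quote).decode('utf8')
#     # -> '☃☃☃“Hi!”'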