Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/dammit.py: 35%
306 statements
coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1# -*- coding: utf-8 -*-
2"""Beautiful Soup bonus library: Unicode, Dammit
4This library converts a bytestream to Unicode through any means
5necessary. It is heavily based on code from Mark Pilgrim's Universal
6Feed Parser. It works best on XML and HTML, but it does not rewrite the
7XML or HTML to reflect a new encoding; that's the tree builder's job.
8"""
9# Use of this source code is governed by the MIT license.
10__license__ = "MIT"
12from html.entities import codepoint2name
13from collections import defaultdict
14import codecs
15import re
16import logging
17import string
19# Import a library to autodetect character encodings. We'll support
20# any of a number of libraries that all support the same API:
21#
22# * cchardet
23# * chardet
24# * charset-normalizer
25chardet_module = None
26try:
27 # PyPI package: cchardet
28 import cchardet as chardet_module
29except ImportError:
30 try:
31 # Debian package: python-chardet
32 # PyPI package: chardet
33 import chardet as chardet_module
34 except ImportError:
35 try:
36 # PyPI package: charset-normalizer
37 import charset_normalizer as chardet_module
38 except ImportError:
39 # No chardet available.
40 chardet_module = None
42if chardet_module:
43 def chardet_dammit(s):
44 if isinstance(s, str):
45 return None
46 return chardet_module.detect(s)['encoding']
47else:
48 def chardet_dammit(s):
49 return None
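# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# chardet_dammit() is the single entry point the rest of this module uses for
# third-party detection. It returns None for str input (already Unicode) or
# when no detector library is installed; otherwise it returns the detector's
# best guess, e.g. (exact result depends on which library is installed):
#
#     chardet_dammit(b'caf\xc3\xa9 au lait')   # -> 'utf-8' (or a similar guess)
#     chardet_dammit('already unicode')        # -> None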
51# Build bytestring and Unicode versions of regular expressions for finding
52# a declared encoding inside an XML or HTML document.
53xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
54html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
55encoding_res = dict()
56encoding_res[bytes] = {
57 'html' : re.compile(html_meta.encode("ascii"), re.I),
58 'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
59}
60encoding_res[str] = {
61 'html' : re.compile(html_meta, re.I),
62 'xml' : re.compile(xml_encoding, re.I)
63}
65from html.entities import html5
67class EntitySubstitution(object):
68 """The ability to substitute XML or HTML entities for certain characters."""
70 def _populate_class_variables():
71 """Initialize variables used by this class to manage the plethora of
72 HTML5 named entities.
74 This function returns a 3-tuple containing two dictionaries
75 and a regular expression:
77 unicode_to_name - A mapping of Unicode strings like "⦨" to
78 entity names like "angmsdaa". When a single Unicode string has
79 multiple entity names, we try to choose the most commonly-used
80 name.
82 name_to_unicode: A mapping of entity names like "angmsdaa" to
83 Unicode strings like "⦨".
85 named_entity_re: A regular expression matching (almost) any
86 Unicode string that corresponds to an HTML5 named entity.
87 """
88 unicode_to_name = {}
89 name_to_unicode = {}
91 short_entities = set()
92 long_entities_by_first_character = defaultdict(set)
94 for name_with_semicolon, character in sorted(html5.items()):
95 # "It is intentional, for legacy compatibility, that many
96 # code points have multiple character reference names. For
97 # example, some appear both with and without the trailing
98 # semicolon, or with different capitalizations."
99 # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
100 #
101 # The parsers are in charge of handling (or not) character
102 # references with no trailing semicolon, so we remove the
103 # semicolon whenever it appears.
104 if name_with_semicolon.endswith(';'):
105 name = name_with_semicolon[:-1]
106 else:
107 name = name_with_semicolon
109 # When parsing HTML, we want to recognize any known named
110 # entity and convert it to a sequence of Unicode
111 # characters.
112 if name not in name_to_unicode:
113 name_to_unicode[name] = character
115 # When _generating_ HTML, we want to recognize special
116 # character sequences that _could_ be converted to named
117 # entities.
118 unicode_to_name[character] = name
120 # We also need to build a regular expression that lets us
121 # _find_ those characters in output strings so we can
122 # replace them.
123 #
124 # This is tricky, for two reasons.
126 if (len(character) == 1 and ord(character) < 128
127 and character not in '<>&'):
128 # First, it would be annoying to turn single ASCII
129 # characters like "|" into named entities like
130 # "&verbar;". The exceptions are <>&, which we _must_
131 # turn into named entities to produce valid HTML.
132 continue
134 if len(character) > 1 and all(ord(x) < 128 for x in character):
135 # We also do not want to turn _combinations_ of ASCII
136 # characters like 'fj' into named entities like '&fjlig;',
137 # though that's more debatable.
138 continue
140 # Second, some named entities have a Unicode value that's
141 # a subset of the Unicode value for some _other_ named
142 # entity. As an example, '\u2267' is &GreaterFullEqual;,
143 # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
144 # expression needs to match the first two characters of
145 # "\u2267\u0338foo", but only the first character of
146 # "\u2267foo".
147 #
148 # In this step, we build two sets of characters that
149 # _eventually_ need to go into the regular expression. But
150 # we won't know exactly what the regular expression needs
151 # to look like until we've gone through the entire list of
152 # named entities.
153 if len(character) == 1:
154 short_entities.add(character)
155 else:
156 long_entities_by_first_character[character[0]].add(character)
158 # Now that we've been through the entire list of entities, we
159 # can create a regular expression that matches any of them.
160 particles = set()
161 for short in short_entities:
162 long_versions = long_entities_by_first_character[short]
163 if not long_versions:
164 particles.add(short)
165 else:
166 ignore = "".join([x[1] for x in long_versions])
167 # This finds, e.g. \u2267 but only if it is _not_
168 # followed by \u0338.
169 particles.add("%s(?![%s])" % (short, ignore))
171 for long_entities in list(long_entities_by_first_character.values()):
172 for long_entity in long_entities:
173 particles.add(long_entity)
175 re_definition = "(%s)" % "|".join(particles)
177 # If an entity shows up in both html5 and codepoint2name, it's
178 # likely that HTML5 gives it several different names, such as
179 # 'rsquo' and 'rsquor'. When converting Unicode characters to
180 # named entities, the codepoint2name name should take
181 # precedence where possible, since that's the more easily
182 # recognizable one.
183 for codepoint, name in list(codepoint2name.items()):
184 character = chr(codepoint)
185 unicode_to_name[character] = name
187 return unicode_to_name, name_to_unicode, re.compile(re_definition)
188 (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
189 CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
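# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# The class variables built above support lookups in both directions, e.g.:
#
#     EntitySubstitution.CHARACTER_TO_HTML_ENTITY['\u2014']   # -> 'mdash'
#     EntitySubstitution.HTML_ENTITY_TO_CHARACTER['mdash']    # -> '\u2014' (an em dash)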
191 CHARACTER_TO_XML_ENTITY = {
192 "'": "apos",
193 '"': "quot",
194 "&": "amp",
195 "<": "lt",
196 ">": "gt",
197 }
199 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
200 "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
201 ")")
203 AMPERSAND_OR_BRACKET = re.compile("([<>&])")
205 @classmethod
206 def _substitute_html_entity(cls, matchobj):
207 """Used with a regular expression to substitute the
208 appropriate HTML entity for a special character string."""
209 entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
210 return "&%s;" % entity
212 @classmethod
213 def _substitute_xml_entity(cls, matchobj):
214 """Used with a regular expression to substitute the
215 appropriate XML entity for a special character string."""
216 entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
217 return "&%s;" % entity
219 @classmethod
220 def quoted_attribute_value(cls, value):
221 """Make a value into a quoted XML attribute, possibly escaping it.
223 Most strings will be quoted using double quotes.
225 Bob's Bar -> "Bob's Bar"
227 If a string contains double quotes, it will be quoted using
228 single quotes.
230 Welcome to "my bar" -> 'Welcome to "my bar"'
232 If a string contains both single and double quotes, the
233 double quotes will be escaped, and the string will be quoted
234 using double quotes.
236 Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
237 """
238 quote_with = '"'
239 if '"' in value:
240 if "'" in value:
241 # The string contains both single and double
242 # quotes. Turn the double quotes into
243 # entities. We quote the double quotes rather than
244 # the single quotes because the entity name is
245 # "&quot;" whether this is HTML or XML. If we
246 # quoted the single quotes, we'd have to decide
247 # between &apos; and &squot;.
248 replace_with = "&quot;"
249 value = value.replace('"', replace_with)
250 else:
251 # There are double quotes but no single quotes.
252 # We can use single quotes to quote the attribute.
253 quote_with = "'"
254 return quote_with + value + quote_with
256 @classmethod
257 def substitute_xml(cls, value, make_quoted_attribute=False):
258 """Substitute XML entities for special XML characters.
260 :param value: A string to be substituted. The less-than sign
261 will become &lt;, the greater-than sign will become &gt;,
262 and any ampersands will become &amp;. If you want ampersands
263 that appear to be part of an entity definition to be left
264 alone, use substitute_xml_containing_entities() instead.
266 :param make_quoted_attribute: If True, then the string will be
267 quoted, as befits an attribute value.
268 """
269 # Escape angle brackets and ampersands.
270 value = cls.AMPERSAND_OR_BRACKET.sub(
271 cls._substitute_xml_entity, value)
273 if make_quoted_attribute:
274 value = cls.quoted_attribute_value(value)
275 return value
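# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# substitute_xml() escapes every <, > and &, even ampersands that already
# look like entities, e.g.:
#
#     EntitySubstitution.substitute_xml('AT&T <guide>')
#     # -> 'AT&amp;T &lt;guide&gt;'
#     EntitySubstitution.substitute_xml('a < b', make_quoted_attribute=True)
#     # -> '"a &lt; b"'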
277 @classmethod
278 def substitute_xml_containing_entities(
279 cls, value, make_quoted_attribute=False):
280 """Substitute XML entities for special XML characters.
282 :param value: A string to be substituted. The less-than sign will
283 become &lt;, the greater-than sign will become &gt;, and any
284 ampersands that are not part of an entity definition will
285 become &amp;.
287 :param make_quoted_attribute: If True, then the string will be
288 quoted, as befits an attribute value.
289 """
290 # Escape angle brackets, and ampersands that aren't part of
291 # entities.
292 value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
293 cls._substitute_xml_entity, value)
295 if make_quoted_attribute:
296 value = cls.quoted_attribute_value(value)
297 return value
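# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Unlike substitute_xml(), this variant leaves ampersands alone when they
# already introduce an entity, e.g.:
#
#     EntitySubstitution.substitute_xml_containing_entities('&lt; stays, & is escaped')
#     # -> '&lt; stays, &amp; is escaped'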
299 @classmethod
300 def substitute_html(cls, s):
301 """Replace certain Unicode characters with named HTML entities.
303 This differs from data.encode(encoding, 'xmlcharrefreplace')
304 in that the goal is to make the result more readable (to those
305 with ASCII displays) rather than to recover from
306 errors. There's absolutely nothing wrong with a UTF-8 string
307 containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
308 character with "&eacute;" will make it more readable to some
309 people.
311 :param s: A Unicode string.
312 """
313 return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
314 cls._substitute_html_entity, s)
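# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# substitute_html() replaces characters that have named entities but leaves
# plain ASCII alone, e.g.:
#
#     EntitySubstitution.substitute_html('caf\u00e9 \u2014 50\u00a2')
#     # -> 'caf&eacute; &mdash; 50&cent;'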
317class EncodingDetector:
318 """Suggests a number of possible encodings for a bytestring.
320 Order of precedence:
322 1. Encodings you specifically tell EncodingDetector to try first
323 (the known_definite_encodings argument to the constructor).
325 2. An encoding determined by sniffing the document's byte-order mark.
327 3. Encodings you specifically tell EncodingDetector to try if
328 byte-order mark sniffing fails (the user_encodings argument to the
329 constructor).
331 4. An encoding declared within the bytestring itself, either in an
332 XML declaration (if the bytestring is to be interpreted as an XML
333 document), or in a <meta> tag (if the bytestring is to be
334 interpreted as an HTML document.)
336 5. An encoding detected through textual analysis by chardet,
337 cchardet, or a similar external library.
339 6. UTF-8.
341 7. Windows-1252.
343 """
344 def __init__(self, markup, known_definite_encodings=None,
345 is_html=False, exclude_encodings=None,
346 user_encodings=None, override_encodings=None):
347 """Constructor.
349 :param markup: Some markup in an unknown encoding.
351 :param known_definite_encodings: When determining the encoding
352 of `markup`, these encodings will be tried first, in
353 order. In HTML terms, this corresponds to the "known
354 definite encoding" step defined here:
355 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
357 :param user_encodings: These encodings will be tried after the
358 `known_definite_encodings` have been tried and failed, and
359 after an attempt to sniff the encoding by looking at a
360 byte order mark has failed. In HTML terms, this
361 corresponds to the step "user has explicitly instructed
362 the user agent to override the document's character
363 encoding", defined here:
364 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
366 :param override_encodings: A deprecated alias for
367 known_definite_encodings. Any encodings here will be tried
368 immediately after the encodings in
369 known_definite_encodings.
371 :param is_html: If True, this markup is considered to be
372 HTML. Otherwise it's assumed to be XML.
374 :param exclude_encodings: These encodings will not be tried,
375 even if they otherwise would be.
377 """
378 self.known_definite_encodings = list(known_definite_encodings or [])
379 if override_encodings:
380 self.known_definite_encodings += override_encodings
381 self.user_encodings = user_encodings or []
382 exclude_encodings = exclude_encodings or []
383 self.exclude_encodings = set([x.lower() for x in exclude_encodings])
384 self.chardet_encoding = None
385 self.is_html = is_html
386 self.declared_encoding = None
388 # First order of business: strip a byte-order mark.
389 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
391 def _usable(self, encoding, tried):
392 """Should we even bother to try this encoding?
394 :param encoding: Name of an encoding.
395 :param tried: Encodings that have already been tried. This will be modified
396 as a side effect.
397 """
398 if encoding is not None:
399 encoding = encoding.lower()
400 if encoding in self.exclude_encodings:
401 return False
402 if encoding not in tried:
403 tried.add(encoding)
404 return True
405 return False
407 @property
408 def encodings(self):
409 """Yield a number of encodings that might work for this markup.
411 :yield: A sequence of strings.
412 """
413 tried = set()
415 # First, try the known definite encodings
416 for e in self.known_definite_encodings:
417 if self._usable(e, tried):
418 yield e
420 # Did the document originally start with a byte-order mark
421 # that indicated its encoding?
422 if self._usable(self.sniffed_encoding, tried):
423 yield self.sniffed_encoding
425 # Sniffing the byte-order mark did nothing; try the user
426 # encodings.
427 for e in self.user_encodings:
428 if self._usable(e, tried):
429 yield e
431 # Look within the document for an XML or HTML encoding
432 # declaration.
433 if self.declared_encoding is None:
434 self.declared_encoding = self.find_declared_encoding(
435 self.markup, self.is_html)
436 if self._usable(self.declared_encoding, tried):
437 yield self.declared_encoding
439 # Use third-party character set detection to guess at the
440 # encoding.
441 if self.chardet_encoding is None:
442 self.chardet_encoding = chardet_dammit(self.markup)
443 if self._usable(self.chardet_encoding, tried):
444 yield self.chardet_encoding
446 # As a last-ditch effort, try utf-8 and windows-1252.
447 for e in ('utf-8', 'windows-1252'):
448 if self._usable(e, tried):
449 yield e
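# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Typical use is to walk the generator until a decode succeeds, e.g.:
#
#     detector = EncodingDetector(b'\xef\xbb\xbf<html>...</html>', is_html=True)
#     for encoding in detector.encodings:
#         # the UTF-8 byte-order mark above makes 'utf-8' an early candidate;
#         # try decoding here and stop at the first encoding that works.
#         ...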
451 @classmethod
452 def strip_byte_order_mark(cls, data):
453 """If a byte-order mark is present, strip it and return the encoding it implies.
455 :param data: Some markup.
456 :return: A 2-tuple (modified data, implied encoding)
457 """
458 encoding = None
459 if isinstance(data, str):
460 # Unicode data cannot have a byte-order mark.
461 return data, encoding
462 if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
463 and (data[2:4] != b'\x00\x00'):
464 encoding = 'utf-16be'
465 data = data[2:]
466 elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
467 and (data[2:4] != b'\x00\x00'):
468 encoding = 'utf-16le'
469 data = data[2:]
470 elif data[:3] == b'\xef\xbb\xbf':
471 encoding = 'utf-8'
472 data = data[3:]
473 elif data[:4] == b'\x00\x00\xfe\xff':
474 encoding = 'utf-32be'
475 data = data[4:]
476 elif data[:4] == b'\xff\xfe\x00\x00':
477 encoding = 'utf-32le'
478 data = data[4:]
479 return data, encoding
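# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# The BOM, if any, is removed and its implied encoding reported, e.g.:
#
#     EncodingDetector.strip_byte_order_mark(b'\xef\xbb\xbfhello')
#     # -> (b'hello', 'utf-8')
#     EncodingDetector.strip_byte_order_mark(b'hello')
#     # -> (b'hello', None)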
481 @classmethod
482 def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
483 """Given a document, tries to find its declared encoding.
485 An XML encoding is declared at the beginning of the document.
487 An HTML encoding is declared in a <meta> tag, hopefully near the
488 beginning of the document.
490 :param markup: Some markup.
491 :param is_html: If True, this markup is considered to be HTML. Otherwise
492 it's assumed to be XML.
493 :param search_entire_document: Since an encoding is supposed to be declared near the beginning
494 of the document, most of the time it's only necessary to search a few kilobytes of data.
495 Set this to True to force this method to search the entire document.
496 """
497 if search_entire_document:
498 xml_endpos = html_endpos = len(markup)
499 else:
500 xml_endpos = 1024
501 html_endpos = max(2048, int(len(markup) * 0.05))
503 if isinstance(markup, bytes):
504 res = encoding_res[bytes]
505 else:
506 res = encoding_res[str]
508 xml_re = res['xml']
509 html_re = res['html']
510 declared_encoding = None
511 declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
512 if not declared_encoding_match and is_html:
513 declared_encoding_match = html_re.search(markup, endpos=html_endpos)
514 if declared_encoding_match is not None:
515 declared_encoding = declared_encoding_match.groups()[0]
516 if declared_encoding:
517 if isinstance(declared_encoding, bytes):
518 declared_encoding = declared_encoding.decode('ascii', 'replace')
519 return declared_encoding.lower()
520 return None
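# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Declared encodings are reported lowercased, e.g.:
#
#     EncodingDetector.find_declared_encoding(
#         b'<?xml version="1.0" encoding="ISO-8859-1"?><doc/>')
#     # -> 'iso-8859-1'
#     EncodingDetector.find_declared_encoding(b'<meta charset="utf-8">', is_html=True)
#     # -> 'utf-8'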
522class UnicodeDammit:
523 """A class for detecting the encoding of a *ML document and
524 converting it to a Unicode string. If the source encoding is
525 windows-1252, can replace MS smart quotes with their HTML or XML
526 equivalents."""
528 # This dictionary maps commonly seen values for "charset" in HTML
529 # meta tags to the corresponding Python codec names. It only covers
530 # values that aren't in Python's aliases and can't be determined
531 # by the heuristics in find_codec.
532 CHARSET_ALIASES = {"macintosh": "mac-roman",
533 "x-sjis": "shift-jis"}
535 ENCODINGS_WITH_SMART_QUOTES = [
536 "windows-1252",
537 "iso-8859-1",
538 "iso-8859-2",
539 ]
541 def __init__(self, markup, known_definite_encodings=[],
542 smart_quotes_to=None, is_html=False, exclude_encodings=[],
543 user_encodings=None, override_encodings=None
544 ):
545 """Constructor.
547 :param markup: A bytestring representing markup in an unknown encoding.
549 :param known_definite_encodings: When determining the encoding
550 of `markup`, these encodings will be tried first, in
551 order. In HTML terms, this corresponds to the "known
552 definite encoding" step defined here:
553 https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
555 :param user_encodings: These encodings will be tried after the
556 `known_definite_encodings` have been tried and failed, and
557 after an attempt to sniff the encoding by looking at a
558 byte order mark has failed. In HTML terms, this
559 corresponds to the step "user has explicitly instructed
560 the user agent to override the document's character
561 encoding", defined here:
562 https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
564 :param override_encodings: A deprecated alias for
565 known_definite_encodings. Any encodings here will be tried
566 immediately after the encodings in
567 known_definite_encodings.
569 :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
570 to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
571 Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
572 will convert them to HTML entity references.
573 :param is_html: If True, this markup is considered to be HTML. Otherwise
574 it's assumed to be XML.
575 :param exclude_encodings: These encodings will not be considered, even
576 if the sniffing code thinks they might make sense.
578 """
579 self.smart_quotes_to = smart_quotes_to
580 self.tried_encodings = []
581 self.contains_replacement_characters = False
582 self.is_html = is_html
583 self.log = logging.getLogger(__name__)
584 self.detector = EncodingDetector(
585 markup, known_definite_encodings, is_html, exclude_encodings,
586 user_encodings, override_encodings
587 )
589 # Short-circuit if the data is in Unicode to begin with.
590 if isinstance(markup, str) or markup == '':
591 self.markup = markup
592 self.unicode_markup = str(markup)
593 self.original_encoding = None
594 return
596 # The encoding detector may have stripped a byte-order mark.
597 # Use the stripped markup from this point on.
598 self.markup = self.detector.markup
600 u = None
601 for encoding in self.detector.encodings:
602 markup = self.detector.markup
603 u = self._convert_from(encoding)
604 if u is not None:
605 break
607 if not u:
608 # None of the encodings worked. As an absolute last resort,
609 # try them again with character replacement.
611 for encoding in self.detector.encodings:
612 if encoding != "ascii":
613 u = self._convert_from(encoding, "replace")
614 if u is not None:
615 self.log.warning(
616 "Some characters could not be decoded, and were "
617 "replaced with REPLACEMENT CHARACTER."
618 )
619 self.contains_replacement_characters = True
620 break
622 # If none of that worked, we could at this point force it to
623 # ASCII, but that would destroy so much data that I think
624 # giving up is better.
625 self.unicode_markup = u
626 if not u:
627 self.original_encoding = None
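# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# A minimal UnicodeDammit round trip; the winning encoding depends on which
# detector library, if any, is installed:
#
#     dammit = UnicodeDammit(b'<p>caf\xe9</p>', is_html=True)
#     dammit.unicode_markup       # -> '<p>café</p>' (if a Latin-1-family guess wins)
#     dammit.original_encoding    # -> e.g. 'windows-1252'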
629 def _sub_ms_char(self, match):
630 """Changes a MS smart quote character to an XML or HTML
631 entity, or an ASCII character."""
632 orig = match.group(1)
633 if self.smart_quotes_to == 'ascii':
634 sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
635 else:
636 sub = self.MS_CHARS.get(orig)
637 if type(sub) == tuple:
638 if self.smart_quotes_to == 'xml':
639 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
640 else:
641 sub = '&'.encode() + sub[0].encode() + ';'.encode()
642 else:
643 sub = sub.encode()
644 return sub
646 def _convert_from(self, proposed, errors="strict"):
647 """Attempt to convert the markup to the proposed encoding.
649 :param proposed: The name of a character encoding.
650 """
651 proposed = self.find_codec(proposed)
652 if not proposed or (proposed, errors) in self.tried_encodings:
653 return None
654 self.tried_encodings.append((proposed, errors))
655 markup = self.markup
656 # Convert smart quotes to HTML if coming from an encoding
657 # that might have them.
658 if (self.smart_quotes_to is not None
659 and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
660 smart_quotes_re = b"([\x80-\x9f])"
661 smart_quotes_compiled = re.compile(smart_quotes_re)
662 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
664 try:
665 #print("Trying to convert document to %s (errors=%s)" % (
666 # proposed, errors))
667 u = self._to_unicode(markup, proposed, errors)
668 self.markup = u
669 self.original_encoding = proposed
670 except Exception as e:
671 #print("That didn't work!")
672 #print(e)
673 return None
674 #print("Correct encoding: %s" % proposed)
675 return self.markup
677 def _to_unicode(self, data, encoding, errors="strict"):
678 """Given a string and its encoding, decodes the string into Unicode.
680 :param encoding: The name of an encoding.
681 """
682 return str(data, encoding, errors)
684 @property
685 def declared_html_encoding(self):
686 """If the markup is an HTML document, returns the encoding declared _within_
687 the document.
688 """
689 if not self.is_html:
690 return None
691 return self.detector.declared_encoding
693 def find_codec(self, charset):
694 """Convert the name of a character set to a codec name.
696 :param charset: The name of a character set.
697 :return: The name of a codec.
698 """
699 value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
700 or (charset and self._codec(charset.replace("-", "")))
701 or (charset and self._codec(charset.replace("-", "_")))
702 or (charset and charset.lower())
703 or charset
704 )
705 if value:
706 return value.lower()
707 return None
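# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# Given a UnicodeDammit instance `dammit`, CHARSET_ALIASES and the
# hyphen/underscore rewrites let sloppy charset names resolve to real codecs:
#
#     dammit.find_codec('macintosh')   # -> 'mac-roman'
#     dammit.find_codec('utf8')        # -> 'utf8'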
709 def _codec(self, charset):
710 if not charset:
711 return charset
712 codec = None
713 try:
714 codecs.lookup(charset)
715 codec = charset
716 except (LookupError, ValueError):
717 pass
718 return codec
721 # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
722 MS_CHARS = {b'\x80': ('euro', '20AC'),
723 b'\x81': ' ',
724 b'\x82': ('sbquo', '201A'),
725 b'\x83': ('fnof', '192'),
726 b'\x84': ('bdquo', '201E'),
727 b'\x85': ('hellip', '2026'),
728 b'\x86': ('dagger', '2020'),
729 b'\x87': ('Dagger', '2021'),
730 b'\x88': ('circ', '2C6'),
731 b'\x89': ('permil', '2030'),
732 b'\x8A': ('Scaron', '160'),
733 b'\x8B': ('lsaquo', '2039'),
734 b'\x8C': ('OElig', '152'),
735 b'\x8D': '?',
736 b'\x8E': ('#x17D', '17D'),
737 b'\x8F': '?',
738 b'\x90': '?',
739 b'\x91': ('lsquo', '2018'),
740 b'\x92': ('rsquo', '2019'),
741 b'\x93': ('ldquo', '201C'),
742 b'\x94': ('rdquo', '201D'),
743 b'\x95': ('bull', '2022'),
744 b'\x96': ('ndash', '2013'),
745 b'\x97': ('mdash', '2014'),
746 b'\x98': ('tilde', '2DC'),
747 b'\x99': ('trade', '2122'),
748 b'\x9a': ('scaron', '161'),
749 b'\x9b': ('rsaquo', '203A'),
750 b'\x9c': ('oelig', '153'),
751 b'\x9d': '?',
752 b'\x9e': ('#x17E', '17E'),
753 b'\x9f': ('Yuml', '178'),}
755 # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
756 # horrors like stripping diacritical marks to turn á into a, but also
757 # contains non-horrors like turning “ into ".
758 MS_CHARS_TO_ASCII = {
759 b'\x80' : 'EUR',
760 b'\x81' : ' ',
761 b'\x82' : ',',
762 b'\x83' : 'f',
763 b'\x84' : ',,',
764 b'\x85' : '...',
765 b'\x86' : '+',
766 b'\x87' : '++',
767 b'\x88' : '^',
768 b'\x89' : '%',
769 b'\x8a' : 'S',
770 b'\x8b' : '<',
771 b'\x8c' : 'OE',
772 b'\x8d' : '?',
773 b'\x8e' : 'Z',
774 b'\x8f' : '?',
775 b'\x90' : '?',
776 b'\x91' : "'",
777 b'\x92' : "'",
778 b'\x93' : '"',
779 b'\x94' : '"',
780 b'\x95' : '*',
781 b'\x96' : '-',
782 b'\x97' : '--',
783 b'\x98' : '~',
784 b'\x99' : '(TM)',
785 b'\x9a' : 's',
786 b'\x9b' : '>',
787 b'\x9c' : 'oe',
788 b'\x9d' : '?',
789 b'\x9e' : 'z',
790 b'\x9f' : 'Y',
791 b'\xa0' : ' ',
792 b'\xa1' : '!',
793 b'\xa2' : 'c',
794 b'\xa3' : 'GBP',
795 b'\xa4' : '$', #This approximation is especially parochial--this is the
796 #generic currency symbol.
797 b'\xa5' : 'YEN',
798 b'\xa6' : '|',
799 b'\xa7' : 'S',
800 b'\xa8' : '..',
801 b'\xa9' : '',
802 b'\xaa' : '(th)',
803 b'\xab' : '<<',
804 b'\xac' : '!',
805 b'\xad' : ' ',
806 b'\xae' : '(R)',
807 b'\xaf' : '-',
808 b'\xb0' : 'o',
809 b'\xb1' : '+-',
810 b'\xb2' : '2',
811 b'\xb3' : '3',
812 b'\xb4' : ("'", 'acute'),
813 b'\xb5' : 'u',
814 b'\xb6' : 'P',
815 b'\xb7' : '*',
816 b'\xb8' : ',',
817 b'\xb9' : '1',
818 b'\xba' : '(th)',
819 b'\xbb' : '>>',
820 b'\xbc' : '1/4',
821 b'\xbd' : '1/2',
822 b'\xbe' : '3/4',
823 b'\xbf' : '?',
824 b'\xc0' : 'A',
825 b'\xc1' : 'A',
826 b'\xc2' : 'A',
827 b'\xc3' : 'A',
828 b'\xc4' : 'A',
829 b'\xc5' : 'A',
830 b'\xc6' : 'AE',
831 b'\xc7' : 'C',
832 b'\xc8' : 'E',
833 b'\xc9' : 'E',
834 b'\xca' : 'E',
835 b'\xcb' : 'E',
836 b'\xcc' : 'I',
837 b'\xcd' : 'I',
838 b'\xce' : 'I',
839 b'\xcf' : 'I',
840 b'\xd0' : 'D',
841 b'\xd1' : 'N',
842 b'\xd2' : 'O',
843 b'\xd3' : 'O',
844 b'\xd4' : 'O',
845 b'\xd5' : 'O',
846 b'\xd6' : 'O',
847 b'\xd7' : '*',
848 b'\xd8' : 'O',
849 b'\xd9' : 'U',
850 b'\xda' : 'U',
851 b'\xdb' : 'U',
852 b'\xdc' : 'U',
853 b'\xdd' : 'Y',
854 b'\xde' : 'b',
855 b'\xdf' : 'B',
856 b'\xe0' : 'a',
857 b'\xe1' : 'a',
858 b'\xe2' : 'a',
859 b'\xe3' : 'a',
860 b'\xe4' : 'a',
861 b'\xe5' : 'a',
862 b'\xe6' : 'ae',
863 b'\xe7' : 'c',
864 b'\xe8' : 'e',
865 b'\xe9' : 'e',
866 b'\xea' : 'e',
867 b'\xeb' : 'e',
868 b'\xec' : 'i',
869 b'\xed' : 'i',
870 b'\xee' : 'i',
871 b'\xef' : 'i',
872 b'\xf0' : 'o',
873 b'\xf1' : 'n',
874 b'\xf2' : 'o',
875 b'\xf3' : 'o',
876 b'\xf4' : 'o',
877 b'\xf5' : 'o',
878 b'\xf6' : 'o',
879 b'\xf7' : '/',
880 b'\xf8' : 'o',
881 b'\xf9' : 'u',
882 b'\xfa' : 'u',
883 b'\xfb' : 'u',
884 b'\xfc' : 'u',
885 b'\xfd' : 'y',
886 b'\xfe' : 'b',
887 b'\xff' : 'y',
888 }
890 # A map used when removing rogue Windows-1252/ISO-8859-1
891 # characters in otherwise UTF-8 documents.
892 #
893 # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
894 # Windows-1252.
895 WINDOWS_1252_TO_UTF8 = {
896 0x80 : b'\xe2\x82\xac', # €
897 0x82 : b'\xe2\x80\x9a', # ‚
898 0x83 : b'\xc6\x92', # ƒ
899 0x84 : b'\xe2\x80\x9e', # „
900 0x85 : b'\xe2\x80\xa6', # …
901 0x86 : b'\xe2\x80\xa0', # †
902 0x87 : b'\xe2\x80\xa1', # ‡
903 0x88 : b'\xcb\x86', # ˆ
904 0x89 : b'\xe2\x80\xb0', # ‰
905 0x8a : b'\xc5\xa0', # Š
906 0x8b : b'\xe2\x80\xb9', # ‹
907 0x8c : b'\xc5\x92', # Œ
908 0x8e : b'\xc5\xbd', # Ž
909 0x91 : b'\xe2\x80\x98', # ‘
910 0x92 : b'\xe2\x80\x99', # ’
911 0x93 : b'\xe2\x80\x9c', # “
912 0x94 : b'\xe2\x80\x9d', # ”
913 0x95 : b'\xe2\x80\xa2', # •
914 0x96 : b'\xe2\x80\x93', # –
915 0x97 : b'\xe2\x80\x94', # —
916 0x98 : b'\xcb\x9c', # ˜
917 0x99 : b'\xe2\x84\xa2', # ™
918 0x9a : b'\xc5\xa1', # š
919 0x9b : b'\xe2\x80\xba', # ›
920 0x9c : b'\xc5\x93', # œ
921 0x9e : b'\xc5\xbe', # ž
922 0x9f : b'\xc5\xb8', # Ÿ
923 0xa0 : b'\xc2\xa0', #
924 0xa1 : b'\xc2\xa1', # ¡
925 0xa2 : b'\xc2\xa2', # ¢
926 0xa3 : b'\xc2\xa3', # £
927 0xa4 : b'\xc2\xa4', # ¤
928 0xa5 : b'\xc2\xa5', # ¥
929 0xa6 : b'\xc2\xa6', # ¦
930 0xa7 : b'\xc2\xa7', # §
931 0xa8 : b'\xc2\xa8', # ¨
932 0xa9 : b'\xc2\xa9', # ©
933 0xaa : b'\xc2\xaa', # ª
934 0xab : b'\xc2\xab', # «
935 0xac : b'\xc2\xac', # ¬
936 0xad : b'\xc2\xad', #
937 0xae : b'\xc2\xae', # ®
938 0xaf : b'\xc2\xaf', # ¯
939 0xb0 : b'\xc2\xb0', # °
940 0xb1 : b'\xc2\xb1', # ±
941 0xb2 : b'\xc2\xb2', # ²
942 0xb3 : b'\xc2\xb3', # ³
943 0xb4 : b'\xc2\xb4', # ´
944 0xb5 : b'\xc2\xb5', # µ
945 0xb6 : b'\xc2\xb6', # ¶
946 0xb7 : b'\xc2\xb7', # ·
947 0xb8 : b'\xc2\xb8', # ¸
948 0xb9 : b'\xc2\xb9', # ¹
949 0xba : b'\xc2\xba', # º
950 0xbb : b'\xc2\xbb', # »
951 0xbc : b'\xc2\xbc', # ¼
952 0xbd : b'\xc2\xbd', # ½
953 0xbe : b'\xc2\xbe', # ¾
954 0xbf : b'\xc2\xbf', # ¿
955 0xc0 : b'\xc3\x80', # À
956 0xc1 : b'\xc3\x81', # Á
957 0xc2 : b'\xc3\x82', # Â
958 0xc3 : b'\xc3\x83', # Ã
959 0xc4 : b'\xc3\x84', # Ä
960 0xc5 : b'\xc3\x85', # Å
961 0xc6 : b'\xc3\x86', # Æ
962 0xc7 : b'\xc3\x87', # Ç
963 0xc8 : b'\xc3\x88', # È
964 0xc9 : b'\xc3\x89', # É
965 0xca : b'\xc3\x8a', # Ê
966 0xcb : b'\xc3\x8b', # Ë
967 0xcc : b'\xc3\x8c', # Ì
968 0xcd : b'\xc3\x8d', # Í
969 0xce : b'\xc3\x8e', # Î
970 0xcf : b'\xc3\x8f', # Ï
971 0xd0 : b'\xc3\x90', # Ð
972 0xd1 : b'\xc3\x91', # Ñ
973 0xd2 : b'\xc3\x92', # Ò
974 0xd3 : b'\xc3\x93', # Ó
975 0xd4 : b'\xc3\x94', # Ô
976 0xd5 : b'\xc3\x95', # Õ
977 0xd6 : b'\xc3\x96', # Ö
978 0xd7 : b'\xc3\x97', # ×
979 0xd8 : b'\xc3\x98', # Ø
980 0xd9 : b'\xc3\x99', # Ù
981 0xda : b'\xc3\x9a', # Ú
982 0xdb : b'\xc3\x9b', # Û
983 0xdc : b'\xc3\x9c', # Ü
984 0xdd : b'\xc3\x9d', # Ý
985 0xde : b'\xc3\x9e', # Þ
986 0xdf : b'\xc3\x9f', # ß
987 0xe0 : b'\xc3\xa0', # à
988 0xe1 : b'\xc3\xa1', # á
989 0xe2 : b'\xc3\xa2', # â
990 0xe3 : b'\xc3\xa3', # ã
991 0xe4 : b'\xc3\xa4', # ä
992 0xe5 : b'\xc3\xa5', # å
993 0xe6 : b'\xc3\xa6', # æ
994 0xe7 : b'\xc3\xa7', # ç
995 0xe8 : b'\xc3\xa8', # è
996 0xe9 : b'\xc3\xa9', # é
997 0xea : b'\xc3\xaa', # ê
998 0xeb : b'\xc3\xab', # ë
999 0xec : b'\xc3\xac', # ì
1000 0xed : b'\xc3\xad', # í
1001 0xee : b'\xc3\xae', # î
1002 0xef : b'\xc3\xaf', # ï
1003 0xf0 : b'\xc3\xb0', # ð
1004 0xf1 : b'\xc3\xb1', # ñ
1005 0xf2 : b'\xc3\xb2', # ò
1006 0xf3 : b'\xc3\xb3', # ó
1007 0xf4 : b'\xc3\xb4', # ô
1008 0xf5 : b'\xc3\xb5', # õ
1009 0xf6 : b'\xc3\xb6', # ö
1010 0xf7 : b'\xc3\xb7', # ÷
1011 0xf8 : b'\xc3\xb8', # ø
1012 0xf9 : b'\xc3\xb9', # ù
1013 0xfa : b'\xc3\xba', # ú
1014 0xfb : b'\xc3\xbb', # û
1015 0xfc : b'\xc3\xbc', # ü
1016 0xfd : b'\xc3\xbd', # ý
1017 0xfe : b'\xc3\xbe', # þ
1018 }
1020 MULTIBYTE_MARKERS_AND_SIZES = [
1021 (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
1022 (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
1023 (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
1024 ]
1026 FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
1027 LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
1029 @classmethod
1030 def detwingle(cls, in_bytes, main_encoding="utf8",
1031 embedded_encoding="windows-1252"):
1032 """Fix characters from one encoding embedded in some other encoding.
1034 Currently the only situation supported is Windows-1252 (or its
1035 subset ISO-8859-1), embedded in UTF-8.
1037 :param in_bytes: A bytestring that you suspect contains
1038 characters from multiple encodings. Note that this _must_
1039 be a bytestring. If you've already converted the document
1040 to Unicode, you're too late.
1041 :param main_encoding: The primary encoding of `in_bytes`.
1042 :param embedded_encoding: The encoding that was used to embed characters
1043 in the main document.
1044 :return: A bytestring in which `embedded_encoding`
1045 characters have been converted to their `main_encoding`
1046 equivalents.
1047 """
1048 if embedded_encoding.replace('_', '-').lower() not in (
1049 'windows-1252', 'windows_1252'):
1050 raise NotImplementedError(
1051 "Windows-1252 and ISO-8859-1 are the only currently supported "
1052 "embedded encodings.")
1054 if main_encoding.lower() not in ('utf8', 'utf-8'):
1055 raise NotImplementedError(
1056 "UTF-8 is the only currently supported main encoding.")
1058 byte_chunks = []
1060 chunk_start = 0
1061 pos = 0
1062 while pos < len(in_bytes):
1063 byte = in_bytes[pos]
1064 if not isinstance(byte, int):
1065 # Python 2.x
1066 byte = ord(byte)
1067 if (byte >= cls.FIRST_MULTIBYTE_MARKER
1068 and byte <= cls.LAST_MULTIBYTE_MARKER):
1069 # This is the start of a UTF-8 multibyte character. Skip
1070 # to the end.
1071 for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
1072 if byte >= start and byte <= end:
1073 pos += size
1074 break
1075 elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
1076 # We found a Windows-1252 character!
1077 # Save the string up to this point as a chunk.
1078 byte_chunks.append(in_bytes[chunk_start:pos])
1080 # Now translate the Windows-1252 character into UTF-8
1081 # and add it as another, one-byte chunk.
1082 byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
1083 pos += 1
1084 chunk_start = pos
1085 else:
1086 # Go on to the next character.
1087 pos += 1
1088 if chunk_start == 0:
1089 # The string is unchanged.
1090 return in_bytes
1091 else:
1092 # Store the final chunk.
1093 byte_chunks.append(in_bytes[chunk_start:])
1094 return b''.join(byte_chunks)
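# --- Editor's sketch (illustrative, not part of bs4.dammit) ---
# detwingle() rescues Windows-1252 bytes embedded in otherwise valid UTF-8:
#
#     snowmen = ('\N{SNOWMAN}' * 3).encode('utf8')
#     quote = '\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}'.encode('windows-1252')
#     UnicodeDammit.detwingle(snowmen + quote).decode('utf8')
#     # -> '☃☃☃“Hi!”'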