Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/dammit.py: 35%

306 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1# -*- coding: utf-8 -*- 

2"""Beautiful Soup bonus library: Unicode, Dammit 

3 

4This library converts a bytestream to Unicode through any means 

5necessary. It is heavily based on code from Mark Pilgrim's Universal 

6Feed Parser. It works best on XML and HTML, but it does not rewrite the 

7XML or HTML to reflect a new encoding; that's the tree builder's job. 

8""" 

9# Use of this source code is governed by the MIT license. 

10__license__ = "MIT" 

11 

12from html.entities import codepoint2name 

13from collections import defaultdict 

14import codecs 

15import re 

16import logging 

17import string 

18 

# Pick a library for autodetecting character encodings. Several
# packages expose the same detect() API; the first one that imports
# successfully is used, in this order of preference:
#
# * cchardet            (PyPI package: cchardet)
# * chardet             (PyPI package: chardet; Debian: python-chardet)
# * charset-normalizer  (PyPI package: charset-normalizer)
chardet_module = None
try:
    import cchardet as chardet_module
except ImportError:
    pass

if chardet_module is None:
    try:
        import chardet as chardet_module
    except ImportError:
        pass

if chardet_module is None:
    try:
        import charset_normalizer as chardet_module
    except ImportError:
        # No detection library is available; chardet_module stays None.
        pass

if not chardet_module:
    def chardet_dammit(s):
        """Stand-in used when no detection library is installed.

        :param s: Ignored.
        :return: Always None.
        """
        return None
else:
    def chardet_dammit(s):
        """Guess the encoding of a bytestring with the detection library.

        :param s: The data to inspect.
        :return: The 'encoding' entry of the library's detect() result,
            or None if `s` is already a Unicode string.
        """
        if isinstance(s, str):
            return None
        return chardet_module.detect(s)['encoding']

50 

# Regular expressions that locate an encoding declared inside an XML
# or HTML document. Each pattern is compiled twice -- once for
# bytestrings and once for Unicode strings -- and stored in
# encoding_res, keyed first by the type of the markup (bytes or str)
# and then by the kind of document ('html' or 'xml').
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}

64 

65from html.entities import html5 

66 

class EntitySubstitution(object):
    """Replaces special characters with XML or HTML entity references."""

    def _populate_class_variables():
        """Build the lookup tables for the HTML5 named entities.

        Runs once, while the class body is being executed.

        :return: A 3-tuple (unicode_to_name, name_to_unicode,
            named_entity_re):

            unicode_to_name - Maps a Unicode string like "⦨" to an
            entity name like "angmsdaa". Where HTML5 offers several
            names for one string, the name from codepoint2name is
            preferred if there is one.

            name_to_unicode - Maps an entity name like "angmsdaa" to
            the Unicode string it stands for, like "⦨".

            named_entity_re - A compiled regular expression matching
            (almost) any Unicode string that has an HTML5 named entity.
        """
        name_to_unicode = {}
        unicode_to_name = {}

        # Characters that will end up in the regular expression,
        # collected in two groups because multi-character entities
        # need special treatment (see below).
        single_characters = set()
        multi_characters_by_first = defaultdict(set)

        for raw_name, character in sorted(html5.items()):
            # The HTML spec deliberately gives many code points more
            # than one name, e.g. with and without the trailing
            # semicolon, or with different capitalization:
            # https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # Whether a reference without a semicolon is acceptable
            # is the parser's business, so the semicolon is dropped
            # here.
            name = raw_name[:-1] if raw_name.endswith(';') else raw_name

            # Parsing direction: any known name should resolve to its
            # characters. The first occurrence of a name wins.
            name_to_unicode.setdefault(name, character)

            # Generating direction: remember a name for this
            # character sequence. The last name seen wins.
            unicode_to_name[character] = name

            # What follows decides whether this character sequence
            # belongs in the regular expression used to find
            # substitutable text in output strings.
            #
            # Plain ASCII is left alone: nobody wants | turned into
            # &verbar; or 'fj' turned into &fjlig; (the latter being
            # more debatable). The exceptions are <, > and &, which
            # must become entities for the output to be valid HTML.
            ascii_only = all(ord(x) < 128 for x in character)
            if ascii_only and (len(character) > 1 or character not in '<>&'):
                continue

            # Some entities are a prefix of another entity: '\u2267'
            # is &GreaterFullEqual; but '\u2267\u0338' is
            # &NotGreaterFullEqual;. The expression must consume both
            # characters of "\u2267\u0338foo" but only the first of
            # "\u2267foo". That can only be arranged once every
            # entity has been seen, so for now just sort the
            # sequences by length.
            if len(character) == 1:
                single_characters.add(character)
            else:
                multi_characters_by_first[character[0]].add(character)

        # Every entity is known now; assemble the alternatives.
        particles = set()
        for short in single_characters:
            longer = multi_characters_by_first.get(short)
            if longer:
                # Match e.g. \u2267 only when it is _not_ followed by
                # \u0338.
                ignore = "".join(x[1] for x in longer)
                particles.add("%s(?![%s])" % (short, ignore))
            else:
                particles.add(short)

        for group in multi_characters_by_first.values():
            particles |= group

        re_definition = "(%s)" % "|".join(particles)

        # A character present in both html5 and codepoint2name
        # probably has several HTML5 names ('rsquo' and 'rsquor', for
        # example). The codepoint2name spelling is the more
        # recognizable one, so it overrides whatever was stored above.
        unicode_to_name.update(
            (chr(codepoint), name)
            for codepoint, name in codepoint2name.items()
        )

        return unicode_to_name, name_to_unicode, re.compile(re_definition)

    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters XML defines named entities for.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # An angle bracket, or an ampersand that does not begin something
    # shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        "([<>]|&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;))")

    # Any angle bracket or ampersand at all.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regular-expression callback: turn the matched text into an
        HTML entity reference.

        :param matchobj: A match produced by CHARACTER_TO_HTML_ENTITY_RE.
        """
        matched = matchobj.group(0)
        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matched)

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regular-expression callback: turn the matched character
        into an XML entity reference.

        :param matchobj: A match whose text is a key of
            CHARACTER_TO_XML_ENTITY.
        """
        matched = matchobj.group(0)
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matched]

    @classmethod
    def quoted_attribute_value(cls, value):
        """Wrap a string in quotes so it can serve as an XML attribute
        value, escaping it only when that cannot be avoided.

        Double quotes are the default:

         Bob's Bar -> "Bob's Bar"

        A value containing double quotes gets single quotes instead:

         Welcome to "my bar" -> 'Welcome to "my bar"'

        A value containing both kinds has its double quotes turned
        into entities and is then wrapped in double quotes:

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Double quotes but no single quotes: single quotes can
            # delimit the value without any escaping.
            return "'" + value + "'"
        # Both kinds are present. The double quotes are the ones that
        # get escaped because "&quot;" is the right entity in HTML
        # and XML alike, whereas the entity for a single quote would
        # depend on the kind of document.
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        Every ampersand is escaped, including those that look like
        the start of an entity; substitute_xml_containing_entities()
        is the variant that leaves existing entities alone.

        :param value: The string to process. Each < becomes &lt;,
            each > becomes &gt; and each & becomes &amp;.

        :param make_quoted_attribute: If True, the result is also
            quoted so it can be used as an attribute value.
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters, leaving
        existing entity references untouched.

        :param value: The string to process. Each < becomes &lt;,
            each > becomes &gt;, and each ampersand that is not part
            of an entity definition becomes &amp;.

        :param make_quoted_attribute: If True, the result is also
            quoted so it can be used as an attribute value.
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        Unlike data.encode(encoding, 'xmlcharrefreplace'), the point
        is readability (for people with ASCII displays), not error
        recovery. A UTF-8 string containing a LATIN SMALL LETTER E
        WITH ACUTE is perfectly fine, but writing that character as
        "&eacute;" makes it more readable to some people.

        :param s: A Unicode string.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)

315 

316 

class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the known_definite_encodings argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    """
    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.

        """
        # Copy the caller's list so that appending the deprecated
        # override_encodings never mutates it.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        # Exclusions are matched case-insensitively (see _usable).
        self.exclude_encodings = {x.lower() for x in exclude_encodings or []}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding, or None.
        :param tried: A set of lowercased encoding names that have
            already been tried. This will be modified as a side effect.
        :return: True if `encoding` is not None, is not excluded, and
            has not been tried before; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        Each access creates a fresh generator that starts over with an
        empty record of tried encodings.

        :yield: A sequence of strings.
        """
        tried = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding). The
            encoding is None if `data` is a Unicode string or does not
            start with a recognized byte-order mark.
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding

        # A UTF-16 byte-order mark followed by two NUL bytes is
        # treated as a UTF-32 byte-order mark instead (the UTF-32LE
        # mark is literally the UTF-16LE mark plus two NULs). The
        # comparison must be made against a bytes literal: a bytes
        # object never equals a str, so comparing against '\x00\x00'
        # would classify every UTF-32LE document as UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup, as bytes or as a Unicode string.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # The patterns exist in a bytes flavor and a str flavor; use
        # the one that matches the type of the markup.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration is looked for even in HTML documents; the
        # <meta> tag is only consulted for HTML, and only if no XML
        # declaration was found.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None

521 

522class UnicodeDammit: 

523 """A class for detecting the encoding of a *ML document and 

524 converting it to a Unicode string. If the source encoding is 

525 windows-1252, can replace MS smart quotes with their HTML or XML 

526 equivalents.""" 

527 

# Commonly seen "charset" values from HTML meta tags, mapped to the
# Python codec names they correspond to. Only values that are not
# already among Python's own aliases, and that the heuristics in
# find_codec cannot work out, need an entry here.
CHARSET_ALIASES = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}

# Codecs for which _convert_from rewrites the bytes \x80-\x9f when
# smart-quote conversion has been requested.
ENCODINGS_WITH_SMART_QUOTES = ["windows-1252", "iso-8859-1", "iso-8859-2"]

540 

def __init__(self, markup, known_definite_encodings=None,
             smart_quotes_to=None, is_html=False, exclude_encodings=None,
             user_encodings=None, override_encodings=None
             ):
    """Constructor.

    :param markup: A bytestring representing markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of `markup`, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

    :param user_encodings: These encodings will be tried after the
        `known_definite_encodings` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

    :param override_encodings: A deprecated alias for
        known_definite_encodings. Any encodings here will be tried
        immediately after the encodings in
        known_definite_encodings.

    :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
        to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
        Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
        will convert them to HTML entity references.
    :param is_html: If True, this markup is considered to be HTML. Otherwise
        it's assumed to be XML.
    :param exclude_encodings: These encodings will not be considered, even
        if the sniffing code thinks they might make sense.

    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = logging.getLogger(__name__)
    # None is passed through for omitted encoding lists; the
    # detector treats None and an empty list alike.
    self.detector = EncodingDetector(
        markup, known_definite_encodings, is_html, exclude_encodings,
        user_encodings, override_encodings
    )

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, str):
        self.markup = markup
        self.unicode_markup = str(markup)
        self.original_encoding = None
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # Failure is signalled by None, not by falsiness: empty input
    # (or input consisting only of a byte-order mark) decodes
    # successfully to '' and must not be mistaken for a failure.
    if u is None:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.

        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER."
                )
                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    self.unicode_markup = u
    if u is None:
        self.original_encoding = None

628 

629 def _sub_ms_char(self, match): 

630 """Changes a MS smart quote character to an XML or HTML 

631 entity, or an ASCII character.""" 

632 orig = match.group(1) 

633 if self.smart_quotes_to == 'ascii': 

634 sub = self.MS_CHARS_TO_ASCII.get(orig).encode() 

635 else: 

636 sub = self.MS_CHARS.get(orig) 

637 if type(sub) == tuple: 

638 if self.smart_quotes_to == 'xml': 

639 sub = '&#x'.encode() + sub[1].encode() + ';'.encode() 

640 else: 

641 sub = '&'.encode() + sub[0].encode() + ';'.encode() 

642 else: 

643 sub = sub.encode() 

644 return sub 

645 

646 def _convert_from(self, proposed, errors="strict"): 

647 """Attempt to convert the markup to the proposed encoding. 

648 

649 :param proposed: The name of a character encoding. 

650 """ 

651 proposed = self.find_codec(proposed) 

652 if not proposed or (proposed, errors) in self.tried_encodings: 

653 return None 

654 self.tried_encodings.append((proposed, errors)) 

655 markup = self.markup 

656 # Convert smart quotes to HTML if coming from an encoding 

657 # that might have them. 

658 if (self.smart_quotes_to is not None 

659 and proposed in self.ENCODINGS_WITH_SMART_QUOTES): 

660 smart_quotes_re = b"([\x80-\x9f])" 

661 smart_quotes_compiled = re.compile(smart_quotes_re) 

662 markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 

663 

664 try: 

665 #print("Trying to convert document to %s (errors=%s)" % ( 

666 # proposed, errors)) 

667 u = self._to_unicode(markup, proposed, errors) 

668 self.markup = u 

669 self.original_encoding = proposed 

670 except Exception as e: 

671 #print("That didn't work!") 

672 #print(e) 

673 return None 

674 #print("Correct encoding: %s" % proposed) 

675 return self.markup 

676 

677 def _to_unicode(self, data, encoding, errors="strict"): 

678 """Given a string and its encoding, decodes the string into Unicode. 

679 

680 :param encoding: The name of an encoding. 

681 """ 

682 return str(data, encoding, errors) 

683 

@property
def declared_html_encoding(self):
    """The encoding declared _within_ the document, provided the
    markup is being treated as HTML; None for any other markup.
    """
    return self.detector.declared_encoding if self.is_html else None

692 

def find_codec(self, charset):
    """Convert the name of a character set to a codec name.

    The candidates are tried in order: the alias from
    CHARSET_ALIASES (or the name itself), the name with hyphens
    removed, and the name with hyphens turned into underscores. If
    none of them is a codec Python knows, the name itself is used.

    :param charset: The name of a character set.
    :return: The name of a codec, lowercased, or None if `charset`
        is empty or None.
    """
    resolved = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not resolved and charset:
        variants = (charset.replace("-", ""), charset.replace("-", "_"))
        for candidate in variants:
            resolved = self._codec(candidate)
            if resolved:
                break
        else:
            # No spelling is a known codec; pass the name along as-is.
            resolved = charset.lower()

    if not resolved:
        return None
    return resolved.lower()

708 

709 def _codec(self, charset): 

710 if not charset: 

711 return charset 

712 codec = None 

713 try: 

714 codecs.lookup(charset) 

715 codec = charset 

716 except (LookupError, ValueError): 

717 pass 

718 return codec 

719 

720 

# A partial mapping of the bytes \x80-\x9f, as used by Windows-1252,
# to replacements for them. A tuple value is (HTML entity name,
# hexadecimal Unicode code point): _sub_ms_char emits "&name;" for
# HTML and "&#xHEX;" for XML. A plain string value is emitted as-is.
#
# Every tuple needs a non-empty code point, otherwise the XML form
# degenerates into the invalid reference "&#x;".
MS_CHARS = {b'\x80': ('euro', '20AC'),
            b'\x81': ' ',
            b'\x82': ('sbquo', '201A'),
            b'\x83': ('fnof', '192'),
            b'\x84': ('bdquo', '201E'),
            b'\x85': ('hellip', '2026'),
            b'\x86': ('dagger', '2020'),
            b'\x87': ('Dagger', '2021'),
            b'\x88': ('circ', '2C6'),
            b'\x89': ('permil', '2030'),
            b'\x8A': ('Scaron', '160'),
            b'\x8B': ('lsaquo', '2039'),
            b'\x8C': ('OElig', '152'),
            b'\x8D': '?',
            b'\x8E': ('#x17D', '17D'),
            b'\x8F': '?',
            b'\x90': '?',
            b'\x91': ('lsquo', '2018'),
            b'\x92': ('rsquo', '2019'),
            b'\x93': ('ldquo', '201C'),
            b'\x94': ('rdquo', '201D'),
            b'\x95': ('bull', '2022'),
            b'\x96': ('ndash', '2013'),
            b'\x97': ('mdash', '2014'),
            b'\x98': ('tilde', '2DC'),
            b'\x99': ('trade', '2122'),
            b'\x9a': ('scaron', '161'),
            b'\x9b': ('rsaquo', '203A'),
            b'\x9c': ('oelig', '153'),
            b'\x9d': '?',
            b'\x9e': ('#x17E', '17E'),
            # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
            b'\x9f': ('Yuml', '178'),}

754 

# A parochial partial mapping of the bytes \x80-\xff to ASCII
# approximations. Contains horrors like stripping diacritical marks
# to turn á into a, but also contains non-horrors like turning “
# into ".
#
# NOTE(review): the entry for \xb4 is the only value that is a tuple
# rather than a string; it is reproduced here unchanged.
MS_CHARS_TO_ASCII = {
    # \x80 - \x8f
    b'\x80': 'EUR', b'\x81': ' ', b'\x82': ',', b'\x83': 'f',
    b'\x84': ',,', b'\x85': '...', b'\x86': '+', b'\x87': '++',
    b'\x88': '^', b'\x89': '%', b'\x8a': 'S', b'\x8b': '<',
    b'\x8c': 'OE', b'\x8d': '?', b'\x8e': 'Z', b'\x8f': '?',
    # \x90 - \x9f
    b'\x90': '?', b'\x91': "'", b'\x92': "'", b'\x93': '"',
    b'\x94': '"', b'\x95': '*', b'\x96': '-', b'\x97': '--',
    b'\x98': '~', b'\x99': '(TM)', b'\x9a': 's', b'\x9b': '>',
    b'\x9c': 'oe', b'\x9d': '?', b'\x9e': 'z', b'\x9f': 'Y',
    # \xa0 - \xaf
    b'\xa0': ' ', b'\xa1': '!', b'\xa2': 'c', b'\xa3': 'GBP',
    # '$' is especially parochial: \xa4 is the generic currency
    # symbol.
    b'\xa4': '$',
    b'\xa5': 'YEN', b'\xa6': '|', b'\xa7': 'S',
    b'\xa8': '..', b'\xa9': '', b'\xaa': '(th)', b'\xab': '<<',
    b'\xac': '!', b'\xad': ' ', b'\xae': '(R)', b'\xaf': '-',
    # \xb0 - \xbf
    b'\xb0': 'o', b'\xb1': '+-', b'\xb2': '2', b'\xb3': '3',
    b'\xb4': ("'", 'acute'),
    b'\xb5': 'u', b'\xb6': 'P', b'\xb7': '*',
    b'\xb8': ',', b'\xb9': '1', b'\xba': '(th)', b'\xbb': '>>',
    b'\xbc': '1/4', b'\xbd': '1/2', b'\xbe': '3/4', b'\xbf': '?',
    # \xc0 - \xcf
    b'\xc0': 'A', b'\xc1': 'A', b'\xc2': 'A', b'\xc3': 'A',
    b'\xc4': 'A', b'\xc5': 'A', b'\xc6': 'AE', b'\xc7': 'C',
    b'\xc8': 'E', b'\xc9': 'E', b'\xca': 'E', b'\xcb': 'E',
    b'\xcc': 'I', b'\xcd': 'I', b'\xce': 'I', b'\xcf': 'I',
    # \xd0 - \xdf
    b'\xd0': 'D', b'\xd1': 'N', b'\xd2': 'O', b'\xd3': 'O',
    b'\xd4': 'O', b'\xd5': 'O', b'\xd6': 'O', b'\xd7': '*',
    b'\xd8': 'O', b'\xd9': 'U', b'\xda': 'U', b'\xdb': 'U',
    b'\xdc': 'U', b'\xdd': 'Y', b'\xde': 'b', b'\xdf': 'B',
    # \xe0 - \xef
    b'\xe0': 'a', b'\xe1': 'a', b'\xe2': 'a', b'\xe3': 'a',
    b'\xe4': 'a', b'\xe5': 'a', b'\xe6': 'ae', b'\xe7': 'c',
    b'\xe8': 'e', b'\xe9': 'e', b'\xea': 'e', b'\xeb': 'e',
    b'\xec': 'i', b'\xed': 'i', b'\xee': 'i', b'\xef': 'i',
    # \xf0 - \xff
    b'\xf0': 'o', b'\xf1': 'n', b'\xf2': 'o', b'\xf3': 'o',
    b'\xf4': 'o', b'\xf5': 'o', b'\xf6': 'o', b'\xf7': '/',
    b'\xf8': 'o', b'\xf9': 'u', b'\xfa': 'u', b'\xfb': 'u',
    b'\xfc': 'u', b'\xfd': 'y', b'\xfe': 'b', b'\xff': 'y',
}

889 

# A map used when replacing rogue Windows-1252/ISO-8859-1 bytes found
# in otherwise UTF-8 documents. Each key is a single byte value (an
# int) and each value is the UTF-8 encoding of the character that byte
# represents in Windows-1252.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252, so they have no entry here.
WINDOWS_1252_TO_UTF8 = {
    0x80 : b'\xe2\x82\xac', # €
    0x82 : b'\xe2\x80\x9a', # ‚
    0x83 : b'\xc6\x92',     # ƒ
    0x84 : b'\xe2\x80\x9e', # „
    0x85 : b'\xe2\x80\xa6', # …
    0x86 : b'\xe2\x80\xa0', # †
    0x87 : b'\xe2\x80\xa1', # ‡
    0x88 : b'\xcb\x86',     # ˆ
    0x89 : b'\xe2\x80\xb0', # ‰
    0x8a : b'\xc5\xa0',     # Š
    0x8b : b'\xe2\x80\xb9', # ‹
    0x8c : b'\xc5\x92',     # Œ
    0x8e : b'\xc5\xbd',     # Ž
    0x91 : b'\xe2\x80\x98', # ‘
    0x92 : b'\xe2\x80\x99', # ’
    0x93 : b'\xe2\x80\x9c', # “
    0x94 : b'\xe2\x80\x9d', # ”
    0x95 : b'\xe2\x80\xa2', # •
    0x96 : b'\xe2\x80\x93', # –
    0x97 : b'\xe2\x80\x94', # —
    0x98 : b'\xcb\x9c',     # ˜
    0x99 : b'\xe2\x84\xa2', # ™
    0x9a : b'\xc5\xa1',     # š
    0x9b : b'\xe2\x80\xba', # ›
    0x9c : b'\xc5\x93',     # œ
    0x9e : b'\xc5\xbe',     # ž
    0x9f : b'\xc5\xb8',     # Ÿ
    0xa0 : b'\xc2\xa0',     # (non-breaking space)
    0xa1 : b'\xc2\xa1',     # ¡
    0xa2 : b'\xc2\xa2',     # ¢
    0xa3 : b'\xc2\xa3',     # £
    0xa4 : b'\xc2\xa4',     # ¤
    0xa5 : b'\xc2\xa5',     # ¥
    0xa6 : b'\xc2\xa6',     # ¦
    0xa7 : b'\xc2\xa7',     # §
    0xa8 : b'\xc2\xa8',     # ¨
    0xa9 : b'\xc2\xa9',     # ©
    0xaa : b'\xc2\xaa',     # ª
    0xab : b'\xc2\xab',     # «
    0xac : b'\xc2\xac',     # ¬
    0xad : b'\xc2\xad',     # (soft hyphen)
    0xae : b'\xc2\xae',     # ®
    0xaf : b'\xc2\xaf',     # ¯
    0xb0 : b'\xc2\xb0',     # °
    0xb1 : b'\xc2\xb1',     # ±
    0xb2 : b'\xc2\xb2',     # ²
    0xb3 : b'\xc2\xb3',     # ³
    0xb4 : b'\xc2\xb4',     # ´
    0xb5 : b'\xc2\xb5',     # µ
    0xb6 : b'\xc2\xb6',     # ¶
    0xb7 : b'\xc2\xb7',     # ·
    0xb8 : b'\xc2\xb8',     # ¸
    0xb9 : b'\xc2\xb9',     # ¹
    0xba : b'\xc2\xba',     # º
    0xbb : b'\xc2\xbb',     # »
    0xbc : b'\xc2\xbc',     # ¼
    0xbd : b'\xc2\xbd',     # ½
    0xbe : b'\xc2\xbe',     # ¾
    0xbf : b'\xc2\xbf',     # ¿
    0xc0 : b'\xc3\x80',     # À
    0xc1 : b'\xc3\x81',     # Á
    0xc2 : b'\xc3\x82',     # Â
    0xc3 : b'\xc3\x83',     # Ã
    0xc4 : b'\xc3\x84',     # Ä
    0xc5 : b'\xc3\x85',     # Å
    0xc6 : b'\xc3\x86',     # Æ
    0xc7 : b'\xc3\x87',     # Ç
    0xc8 : b'\xc3\x88',     # È
    0xc9 : b'\xc3\x89',     # É
    0xca : b'\xc3\x8a',     # Ê
    0xcb : b'\xc3\x8b',     # Ë
    0xcc : b'\xc3\x8c',     # Ì
    0xcd : b'\xc3\x8d',     # Í
    0xce : b'\xc3\x8e',     # Î
    0xcf : b'\xc3\x8f',     # Ï
    0xd0 : b'\xc3\x90',     # Ð
    0xd1 : b'\xc3\x91',     # Ñ
    0xd2 : b'\xc3\x92',     # Ò
    0xd3 : b'\xc3\x93',     # Ó
    0xd4 : b'\xc3\x94',     # Ô
    0xd5 : b'\xc3\x95',     # Õ
    0xd6 : b'\xc3\x96',     # Ö
    0xd7 : b'\xc3\x97',     # ×
    0xd8 : b'\xc3\x98',     # Ø
    0xd9 : b'\xc3\x99',     # Ù
    0xda : b'\xc3\x9a',     # Ú
    0xdb : b'\xc3\x9b',     # Û
    0xdc : b'\xc3\x9c',     # Ü
    0xdd : b'\xc3\x9d',     # Ý
    0xde : b'\xc3\x9e',     # Þ
    0xdf : b'\xc3\x9f',     # ß
    0xe0 : b'\xc3\xa0',     # à
    0xe1 : b'\xc3\xa1',     # á
    0xe2 : b'\xc3\xa2',     # â
    0xe3 : b'\xc3\xa3',     # ã
    0xe4 : b'\xc3\xa4',     # ä
    0xe5 : b'\xc3\xa5',     # å
    0xe6 : b'\xc3\xa6',     # æ
    0xe7 : b'\xc3\xa7',     # ç
    0xe8 : b'\xc3\xa8',     # è
    0xe9 : b'\xc3\xa9',     # é
    0xea : b'\xc3\xaa',     # ê
    0xeb : b'\xc3\xab',     # ë
    0xec : b'\xc3\xac',     # ì
    0xed : b'\xc3\xad',     # í
    0xee : b'\xc3\xae',     # î
    0xef : b'\xc3\xaf',     # ï
    0xf0 : b'\xc3\xb0',     # ð
    0xf1 : b'\xc3\xb1',     # ñ
    0xf2 : b'\xc3\xb2',     # ò
    0xf3 : b'\xc3\xb3',     # ó
    0xf4 : b'\xc3\xb4',     # ô
    0xf5 : b'\xc3\xb5',     # õ
    0xf6 : b'\xc3\xb6',     # ö
    0xf7 : b'\xc3\xb7',     # ÷
    0xf8 : b'\xc3\xb8',     # ø
    0xf9 : b'\xc3\xb9',     # ù
    0xfa : b'\xc3\xba',     # ú
    0xfb : b'\xc3\xbb',     # û
    0xfc : b'\xc3\xbc',     # ü
    0xfd : b'\xc3\xbd',     # ý
    0xfe : b'\xc3\xbe',     # þ
    0xff : b'\xc3\xbf',     # ÿ
}

1019 

# Each entry is (lowest lead byte, highest lead byte, total length in
# bytes) for one class of UTF-8 multibyte character.
MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2),  # C2-DF introduce a 2-byte character
    (0xe0, 0xef, 3),  # E0-EF introduce a 3-byte character
    (0xf0, 0xf4, 4),  # F0-F4 introduce a 4-byte character
]

# The overall range of byte values that can introduce a multibyte
# character: the smallest lower bound and the largest upper bound above.
FIRST_MULTIBYTE_MARKER = min(
    lowest for lowest, _highest, _length in MULTIBYTE_MARKERS_AND_SIZES)
LAST_MULTIBYTE_MARKER = max(
    highest for _lowest, highest, _length in MULTIBYTE_MARKERS_AND_SIZES)

1028 

@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    A byte in the UTF-8 lead-byte range is only treated as the start
    of a multibyte character when it is followed by the full number
    of continuation bytes (0x80-0xBF) that its sequence requires.
    Otherwise it is taken to be a stray Windows-1252 byte and is
    converted like any other, so an accented letter such as a lone
    0xE9 is converted and the bytes that follow it are still examined.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this _must_
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of `in_bytes`.
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document.
    :return: A bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents. When nothing needed converting, `in_bytes`
        itself is returned.
    :raise NotImplementedError: If either encoding is not one of the
        supported ones.
    """
    # Compare case-insensitively and treat '_' and '-' alike.
    embedded = embedded_encoding.replace('_', '-').lower()
    if embedded not in ('windows-1252', 'cp1252', 'iso-8859-1',
                        'latin-1', 'latin1'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.replace('_', '-').lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    def as_int(item):
        # Indexing a bytestring yields ints; anything else is assumed
        # to be a one-character string, as the original code allowed.
        return item if isinstance(item, int) else ord(item)

    replacements = cls.WINDOWS_1252_TO_UTF8
    byte_chunks = []

    chunk_start = 0
    pos = 0
    length = len(in_bytes)
    while pos < length:
        byte = as_int(in_bytes[pos])
        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            # This may be the start of a UTF-8 multibyte character.
            # Find out how long that character would be.
            size = 0
            for start, end, width in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if start <= byte <= end:
                    size = width
                    break
            tail = in_bytes[pos + 1:pos + size]
            if (size and len(tail) == size - 1
                    and all(0x80 <= as_int(b) <= 0xbf for b in tail)):
                # A complete, well-formed sequence: skip to its end.
                pos += size
                continue
            # Not a real UTF-8 sequence; fall through and treat the
            # byte as a possible Windows-1252 character.
        if byte >= 0x80 and byte in replacements:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now add the UTF-8 encoding of that character as the
            # next chunk.
            byte_chunks.append(replacements[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1

    if not byte_chunks:
        # The string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)

1095