1# encoding: utf-8
2"""Use the HTMLParser library to parse HTML files that aren't too bad."""
3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
7__all__ = [
8 'HTMLParserTreeBuilder',
9 ]
10
11from html.parser import HTMLParser
12
# HTMLParseError disappeared from the standard library in Python 3.5.
# Import it where it exists; otherwise define a placeholder so the
# `except HTMLParseError` clause in feed() below remains syntactically
# valid (it can simply never fire on 3.5+).
try:
    from html.parser import HTMLParseError
except ImportError as e:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass
20
21import sys
22import warnings
23
24# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
25# argument, which we'd like to set to False. Unfortunately,
26# http://bugs.python.org/issue13273 makes strict=True a better bet
27# before Python 3.2.3.
28#
29# At the end of this file, we monkeypatch HTMLParser so that
30# strict=True works well on Python 3.2.2.
# Cache the interpreter version components; the feature flags below and
# the monkeypatching at the end of this file test against them.
major, minor, release = sys.version_info[:3]
# strict=False is only safe starting in 3.2.3 (see issue13273 above).
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
# In 3.3 the 'strict' argument still exists but is deprecated, so we
# avoid passing it at all.
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
# 3.4 added 'convert_charrefs'; we pass convert_charrefs=False in
# HTMLParserTreeBuilder.__init__ so entity handling stays with us.
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
35
36
37from bs4.element import (
38 CData,
39 Comment,
40 Declaration,
41 Doctype,
42 ProcessingInstruction,
43 )
44from bs4.dammit import EntitySubstitution, UnicodeDammit
45
46from bs4.builder import (
47 DetectsXMLParsedAsHTML,
48 HTML,
49 HTMLTreeBuilder,
50 STRICT,
51 )
52
53
54HTMLPARSER = 'html.parser'
55
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    """

    # Strategies for handling a duplicate attribute within a single tag.
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

        self._initialize_xml_detector()

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        """
        # Passing handle_empty_element=False tells handle_starttag not
        # to close the tag just because its name matches a known
        # empty-element tag. We know this is an empty-element tag and
        # we want to call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    # A callable strategy decides what happens to the dict.
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

        if self._root_tag is None:
            self._root_tag_encountered(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
            be the closing portion of an empty-element tag,
            e.g. '<tag></tag>'.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                pass
        # If every decoding strategy failed, substitute U+FFFD.
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was an character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        # Strip the "DOCTYPE " prefix; the Doctype node stores only the
        # declaration's payload.
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        # A processing instruction may be the XML declaration of a
        # document mistakenly parsed as HTML; let the detector look.
        self._document_might_be_xml(data)
        self.soup.endData(ProcessingInstruction)
300
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                extra_parser_kwargs[arg] = kwargs.pop(arg)
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            # Beautiful Soup does its own entity substitution, so keep
            # html.parser from converting character references itself.
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Re-raise with the original traceback intact.
            raise
        # Any end tags left on this list will never show up now; reset
        # the state so the parser object doesn't keep stale entries.
        parser.already_closed_empty_element = []
406
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Lenient version of the stdlib's attribute-matching regex: unlike
    # the strict one, it tolerates unquoted and slash-adjacent values.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Backported 3.2.3 regex that finds the end of a start tag.
    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Backport of HTMLParser.parse_starttag from Python 3.2.3.

        Parses the start tag beginning at index i of self.rawdata and
        dispatches to handle_starttag/handle_startendtag; returns the
        index just past the tag, or a negative number if the tag is
        incomplete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            # Tag is not complete in the buffered data; wait for more.
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            # In strict mode use the stdlib attribute regex; otherwise
            # fall back to the tolerant one defined above.
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute with no value at all, e.g. <input disabled>.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip matching single or double quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # There's junk between the last attribute and the closing
            # bracket; compute the position for the error report.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # In tolerant mode, treat the malformed tag as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Backport: buffer raw text until the closing tag for *elem*."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    # Install the backported methods on our parser subclass.
    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the backport in place, strict=True behaves acceptably.
    CONSTRUCTOR_TAKES_STRICT = True