Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/_htmlparser.py: 25% of 137 statements (coverage.py v7.2.7, created at 2023-07-01 06:54 +0000)

# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from html.parser import HTMLParser

import sys
import warnings

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    DetectsXMLParsedAsHTML,
    ParserRejectedMarkup,
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )

HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

        self._initialize_xml_detector()
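
    # Usage sketch (an assumption about typical use, not part of the original
    # module): the duplicate-attribute strategy is normally passed through
    # BeautifulSoup rather than by instantiating this class directly, e.g.
    #
    #   from bs4 import BeautifulSoup
    #   markup = '<a href="first.html" href="second.html">link</a>'
    #   soup = BeautifulSoup(markup, 'html.parser',
    #                        on_duplicate_attribute='ignore')
    #   soup.a['href']  # keeps the first value, 'first.html'
    #
    # A callable strategy receives (attr_dict, key, value) and may mutate
    # attr_dict in place, for example collecting every value into a list.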

    def error(self, message):
        # NOTE: This method is required so long as Python 3.9 is
        # supported. The corresponding code is removed from HTMLParser
        # in 3.5, but not removed from ParserBase until 3.10.
        # https://github.com/python/cpython/issues/76025
        #
        # The original implementation turned the error into a warning,
        # but in every case I discovered, this made HTMLParser
        # immediately crash with an error message that was less
        # helpful than the warning. The new implementation makes it
        # more clear that html.parser just can't parse this
        # markup. The 3.10 implementation does the same, though it
        # raises AssertionError rather than calling a method. (We
        # catch this error and wrap it in a ParserRejectedMarkup.)
        raise ParserRejectedMarkup(message)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: A list of (name, value) pairs for the tag's attributes.
        """
        # is_startend() tells handle_starttag not to close the tag
        # just because its name matches a known empty-element tag. We
        # know that this is an empty-element tag and we want to call
        # handle_endtag ourselves.
        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: A list of (name, value) pairs for the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
                attrvalue = '""'
        #print("START", name)
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

        if self._root_tag is None:
            self._root_tag_encountered(name)
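
    # Illustrative note (an assumption added for clarity, not part of the
    # original module): html.parser reports plain '<br>' only as a start-tag
    # event, so the branch above synthesizes the end tag and records the name
    # in already_closed_empty_element, letting a stray '</br>' later be
    # silently checked off. XHTML-style '<br/>' arrives via
    # handle_startendtag instead, which closes the tag itself.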

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
            be the closing portion of an empty-element tag,
            e.g. '<tag></tag>'.
        """
        #print("END", name)
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            #print("ALREADY CLOSED", name)
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        """
        # TODO: This was originally a workaround for a bug in
        # HTMLParser. (http://bugs.python.org/issue13633) The bug has
        # been fixed, but removing this code still makes some
        # Beautiful Soup tests fail. This needs investigation.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#x201C; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError as e:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError) as e:
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)
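
    # Example of the conversion above (illustrative, not part of the original
    # module): '&#233;' and '&#xE9;' reach handle_charref as '233' and 'xE9'
    # and both become the single character 'é'; a reference like '&#147;'
    # falls in the < 256 range and is reinterpreted as Windows-1252, yielding
    # a left double quotation mark instead of an unprintable control code.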

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)
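
    # Example (illustrative, not part of the original module): '&amp;' maps
    # to '&' through HTML_ENTITY_TO_CHARACTER, while an unrecognized
    # reference such as '&foo' is not in the table and is passed through as
    # the literal text '&foo'.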

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)
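
    # Example (illustrative, not part of the original module): for markup
    # like '<![CDATA[some text]]>', html.parser passes 'CDATA[some text' to
    # this method, the prefix is stripped, and 'some text' ends up wrapped in
    # a CData node; anything else becomes a generic Declaration.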

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self._document_might_be_xml(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        parser_kwargs.update(extra_parser_kwargs)
        parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
            (markup, encoding, declared encoding,
             has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)
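
    # Illustrative sketch of what the generator yields (an assumption added
    # for clarity, not part of the original module):
    #
    #   builder = HTMLParserTreeBuilder()
    #   list(builder.prepare_markup('<p>déjà</p>'))
    #   # -> [('<p>déjà</p>', None, None, False)]; str input is passed through.
    #
    #   # A bytestring instead goes through UnicodeDammit, and the yielded
    #   # tuple carries the decoded markup plus the sniffed and declared
    #   # encodings.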

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except AssertionError as e:
            # html.parser raises AssertionError in rare cases to
            # indicate a fatal problem with the markup, especially
            # when there's an error in the doctype declaration.
            raise ParserRejectedMarkup(e)
        parser.close()
        parser.already_closed_empty_element = []
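

# A small, self-contained usage sketch (an assumption about typical use, not
# part of the original module): the builder is normally selected through
# BeautifulSoup's feature string rather than instantiated directly.
if __name__ == '__main__':
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<p id="a" id="b">Hello<br>world</p>', 'html.parser',
        on_duplicate_attribute='ignore'
    )
    # TRACKS_LINE_NUMBERS means each tag remembers where it came from.
    print(soup.p.sourceline, soup.p.sourcepos)  # 1 0
    print(soup.p['id'])                         # 'a': the first value was kept
    print(soup.br.is_empty_element)             # True: no '</br>' was needed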