1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
4__all__ = [
5 "HTML5TreeBuilder",
6]
7
8from typing import (
9 Any,
10 cast,
11 Dict,
12 Iterable,
13 Optional,
14 Sequence,
15 TYPE_CHECKING,
16 Tuple,
17 Union,
18)
19from typing_extensions import TypeAlias
20from bs4._typing import (
21 _AttributeValue,
22 _AttributeValues,
23 _Encoding,
24 _Encodings,
25 _NamespaceURL,
26 _RawMarkup,
27)
28
29import warnings
30from bs4.builder import (
31 DetectsXMLParsedAsHTML,
32 PERMISSIVE,
33 HTML,
34 HTML_5,
35 HTMLTreeBuilder,
36)
37from bs4.element import (
38 NamespacedAttribute,
39 PageElement,
40 nonwhitespace_re,
41)
42import html5lib
43from html5lib.constants import (
44 namespaces,
45)
46from bs4.element import (
47 Comment,
48 Doctype,
49 NavigableString,
50 Tag,
51)
52
53if TYPE_CHECKING:
54 from bs4 import BeautifulSoup
55
56from html5lib.treebuilders import base as treebuilder_base
57
58
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Build a tree by delegating the parsing to
    `html5lib <https://github.com/html5lib/html5lib-python>`_.

    Some common HTML `TreeBuilder` features are not supported by
    `HTML5TreeBuilder`. In theory some of them could be added, but
    html5lib rearranges the parse tree while it is being built, which
    makes that quite difficult at best.

    In particular:

    * Strings are never given a specialized `NavigableString`
      subclass (e.g. `Script`) based on the name of the enclosing tag.
    * A `SoupStrainer` cannot be used to parse only part of a document.
    """

    NAME: str = "html5lib"

    features: Sequence[str] = [NAME, PERMISSIVE, HTML_5, HTML]

    #: html5lib is able to report the line number and position in the
    #: original file that an element came from.
    TRACKS_LINE_NUMBERS: bool = True

    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
    user_specified_encoding: Optional[_Encoding]

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Yield the single parsing strategy this builder offers.

        :param markup: The markup to be parsed; yielded back unchanged.
        :param user_specified_encoding: Remembered on the builder so
            that `feed` can pass it along to html5lib.
        :param document_declared_encoding: Not supported; a truthy
            value only triggers a warning.
        :param exclude_encodings: Not supported; a truthy value only
            triggers a warning.
        :return: A generator yielding exactly one tuple,
            ``(markup, None, None, False)``.
        """
        # Remember the caller's choice of encoding; feed() needs it.
        self.user_specified_encoding = user_specified_encoding

        # The html5lib TreeBuilder doesn't go through UnicodeDammit,
        # so neither of these arguments has any effect at the moment.
        unsupported = {
            "document_declared_encoding": document_declared_encoding,
            "exclude_encodings": exclude_encodings,
        }
        for name, supplied in unsupported.items():
            if not supplied:
                continue
            warnings.warn(
                f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
                stacklevel=3,
            )

        # html5lib is an HTML-only parser; point it out if the input
        # looks like XML.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup: _RawMarkup) -> None:
        """Parse the given markup with html5lib, filling in the
        `BeautifulSoup` object stored in `HTML5TreeBuilder.soup`.

        :param markup: The document to parse, as a string or as bytes.
        """
        soup = self.soup
        if soup is not None and soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4,
            )

        # Constructing the HTMLParser makes html5lib call
        # self.create_treebuilder(), which is what populates
        # self.underlying_builder.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        assert self.underlying_builder is not None
        self.underlying_builder.parser = parser

        markup_is_text = isinstance(markup, str)
        if markup_is_text:
            parse_kwargs = {}
        else:
            # override_encoding eventually reaches html5lib's
            # HTMLBinaryInputStream.__init__.
            parse_kwargs = {"override_encoding": self.user_specified_encoding}

        doc = parser.parse(markup, **parse_kwargs)

        # Record the character encoding the tokenizer detected.
        if markup_is_text:
            # Special case: given Unicode input, html5lib reports
            # charEncoding as UTF-8, which would be misleading here.
            doc.original_encoding = None
        else:
            # html5lib hands back an Encoding object; other tree
            # builders use a plain string, so store its name.
            detected = parser.tokenizer.stream.charEncoding[0]
            doc.original_encoding = detected.name
        self.underlying_builder.parser = None

    def create_treebuilder(
        self, namespaceHTMLElements: bool
    ) -> "TreeBuilderForHtml5lib":
        """Factory html5lib calls to obtain what it considers a
        'TreeBuilder'. The new object is also kept in
        ``self.underlying_builder``.

        :param namespaceHTMLElements: Whether or not to namespace HTML elements.

        :meta private:
        """
        builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
        )
        self.underlying_builder = builder
        return builder

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        document_template = "<html><head></head><body>%s</body></html>"
        return document_template % fragment
172
173
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """The html5lib-facing tree builder: html5lib drives it, and it
    records the results in a `BeautifulSoup` object.
    """

    soup: "BeautifulSoup"  #: :meta private:
    parser: Optional[html5lib.HTMLParser]  #: :meta private:

    def __init__(
        self,
        namespaceHTMLElements: bool,
        soup: Optional["BeautifulSoup"] = None,
        store_line_numbers: bool = True,
        **kwargs: Any,
    ):
        """
        :param namespaceHTMLElements: Passed through to the html5lib
            base class constructor.
        :param soup: The `BeautifulSoup` object to populate. Leaving
            it out is deprecated and causes a throwaway object to be
            created instead.
        :param store_line_numbers: Whether `elementClass` should record
            the source position of each tag.
        :param kwargs: Only used when ``soup`` is omitted, in which
            case they go to the `BeautifulSoup` constructor.
        """
        if not soup:
            warnings.warn(
                "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
                DeprecationWarning,
                stacklevel=2,
            )
            from bs4 import BeautifulSoup

            # TODO: Why 'html.parser' here? html5lib wouldn't cause
            # an infinite loop and would be more accurate. Probably
            # this whole branch should go away.
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
            )
        # Must be in place before the base constructor runs.
        self.soup = soup

        # TODO: What exactly are **kwargs? Should they be given to
        # the base constructor too, or instead of to BeautifulSoup?
        super().__init__(namespaceHTMLElements)

        self.store_line_numbers = store_line_numbers
        # Replaced later with a real html5lib HTMLParser, which is
        # what lets us find out the current line number.
        self.parser = None

    def documentClass(self) -> "Element":
        """Reset the soup and wrap it as the document node."""
        soup = self.soup
        soup.reset()
        return Element(soup, soup, None)

    def insertDoctype(self, token: Dict[str, Any]) -> None:
        """Turn an html5lib doctype token into a `Doctype` and record
        it in the soup.

        :param token: Token dictionary; its ``name``, ``publicId`` and
            ``systemId`` entries are read.
        """
        doctype = Doctype.for_name_and_ids(
            cast(str, token["name"]),
            cast(Optional[str], token["publicId"]),
            cast(Optional[str], token["systemId"]),
        )
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name: str, namespace: str) -> "Element":
        """Create a new tag and wrap it in an `Element`.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag.
        """
        line: Optional[int] = None
        position: Optional[int] = None
        parser = self.parser
        if parser is not None and self.store_line_numbers:
            # The stream sits just past the end of the tag. Where the
            # tag began is unknown, but it ended on the character
            # right before this position.
            line, position = parser.tokenizer.stream.position()
            assert position is not None
            position -= 1

        new_tag = self.soup.new_tag(
            name, namespace, sourceline=line, sourcepos=position
        )
        return Element(new_tag, self.soup, namespace)

    def commentClass(self, data: str) -> "TextNode":
        """Wrap a new `Comment` holding ``data`` in a `TextNode`."""
        comment = Comment(data)
        return TextNode(comment, self.soup)

    def fragmentClass(self) -> "Element":
        """Deliberately unimplemented: only html5lib's
        HTMLParser.parseFragment() needs this, and that is used by the
        html5lib unit tests rather than by Beautiful Soup. We don't
        currently hook into those tests.
        """
        raise NotImplementedError()

    def getFragment(self) -> "Element":
        """Deliberately unimplemented: only the html5lib unit tests
        use this, and we don't currently hook into those tests.
        """
        raise NotImplementedError()

    def appendChild(self, node: "Element") -> None:
        """Append the wrapped element of ``node`` to the soup."""
        # TODO: Neither the BS4 tests nor, apparently, the html5lib
        # test suite reach this code. It doesn't look test-specific,
        # though, and html5lib calls it (or a method of the same name)
        # in many places, so the implementation stays rather than
        # becoming NotImplementedError().
        self.soup.append(node.element)

    def getDocument(self) -> "BeautifulSoup":
        """Return the `BeautifulSoup` object being populated."""
        return self.soup

    def testSerializer(self, element: "Element") -> str:
        """Deliberately unimplemented: only the html5lib unit tests
        use this, and we don't currently hook into those tests.
        """
        raise NotImplementedError()
275
276
class AttrList(object):
    """Represents a Tag's attributes in a way compatible with html5lib.

    The attributes are copied into ``attrs`` when the object is
    created. Reads (`__getitem__`, `__contains__`, `items`, `keys`,
    `__len__`, iteration) are answered from that copy, while
    `__setitem__` writes to the underlying element only, so a value
    set through this object is not visible through the same object's
    read methods.
    """

    element: Tag
    attrs: _AttributeValues

    def __init__(self, element: Tag):
        """
        :param element: The Tag whose attributes are exposed.
        """
        self.element = element
        # Shallow copy: a snapshot of the attributes as of right now.
        self.attrs = dict(self.element.attrs)

    def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Iterate over a copy of the ``(name, value)`` pairs."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name: str, value: _AttributeValue) -> None:
        """Set an attribute on the underlying element, first splitting
        the value on whitespace if the attribute is multi-valued for
        this element.

        :param name: Attribute name.
        :param value: Attribute value; a string, or a list if the
            value was already split.
        """
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if name in list_attr.get("*", []) or name in list_attr.get(
            self.element.name, []
        ):
            # A node that is being cloned may have already undergone
            # this procedure. Check for this and skip it.
            if not isinstance(value, list):
                assert isinstance(value, str)
                value = self.element.attribute_value_list_class(
                    nonwhitespace_re.findall(value)
                )
        self.element[name] = value

    def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Return a list of the ``(name, value)`` pairs."""
        return list(self.attrs.items())

    def keys(self) -> Iterable[str]:
        """Return a list of the attribute names."""
        return list(self.attrs)

    def __len__(self) -> int:
        """Return the number of attributes in the snapshot."""
        return len(self.attrs)

    def __getitem__(self, name: str) -> _AttributeValue:
        """Return the value of the named attribute.

        :raise KeyError: If the snapshot has no such attribute.
        """
        return self.attrs[name]

    def __contains__(self, name: str) -> bool:
        """Return whether the snapshot has an attribute called ``name``."""
        # Ask the dict directly instead of copying its keys into a
        # list and scanning that on every lookup.
        return name in self.attrs
321
322
class BeautifulSoupNode(treebuilder_base.Node):
    """Common base for the html5lib node wrappers defined in this
    module.
    """

    # The wrapped Beautiful Soup object.
    element: PageElement
    # The BeautifulSoup object being populated.
    soup: "BeautifulSoup"
    namespace: Optional[_NamespaceURL]

    @property
    def nodeType(self) -> int:
        """The html5lib constant that corresponds to the type of the
        underlying DOM object.

        NOTE: Only the html5lib test suite reads this property;
        Beautiful Soup proper never does, so it is not implemented.
        """
        raise NotImplementedError()

    # TODO-TYPING: the typeshed stubs get this wrong; cloneNode
    # returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:
        """Not implemented at this level."""
        raise NotImplementedError()
342
343
class Element(BeautifulSoupNode):
    """html5lib node wrapper around a Beautiful Soup `Tag`."""

    element: Tag
    namespace: Optional[_NamespaceURL]

    def __init__(
        self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
    ):
        """
        :param element: The Tag being wrapped.
        :param soup: The BeautifulSoup object being populated.
        :param namespace: The namespace of the tag, if any.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node: "BeautifulSoupNode") -> None:
        """Make ``node`` the last child of this element.

        A plain `NavigableString` appended right after another plain
        `NavigableString` is merged with it rather than added as a
        separate child.

        :param node: The node to append.
        """
        string_child: Optional[NavigableString] = None
        child: PageElement
        # Exact type check on purpose: subclasses of NavigableString
        # are not treated as mergeable strings.
        if type(node.element) is NavigableString:
            string_child = child = node.element
        else:
            child = node.element
        node.parent = self

        if (
            child is not None
            and child.parent is not None
            and not isinstance(child, str)
        ):
            # The child already lives somewhere in a tree; detach it.
            node.element.extract()

        if (
            string_child is not None
            and self.element.contents
            and type(self.element.contents[-1]) is NavigableString
        ):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element, most_recent_element=most_recent_element
            )

    def getAttributes(self) -> AttrList:
        """Return the wrapped element's attributes as an `AttrList`
        (or an empty dict if the wrapped element is a `Comment`).
        """
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    # An HTML5lib attribute name may either be a single string,
    # or a tuple (namespace, name).
    _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
    # Now we can define the type this method accepts as a dictionary
    # mapping those attribute names to single string values.
    _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]

    def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
        """Copy html5lib's attributes onto the wrapped Tag.

        :param attributes: Mapping of attribute names to string
            values. It is modified in place: tuple names are replaced
            with `NamespacedAttribute` objects. ``None`` or an empty
            mapping means there is nothing to do.
        """
        if attributes is not None and len(attributes) > 0:
            # Replace any namespaced attributes with
            # NamespacedAttribute objects.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            # We can now cast attributes to the type of Dict
            # used by Beautiful Soup.
            normalized_attributes = cast(_AttributeValues, attributes)

            # Values for tags like 'class' came in as single strings;
            # replace them with lists of strings as appropriate.
            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, normalized_attributes
            )

            # Then set the attributes on the Tag associated with this
            # BeautifulSoupNode.
            for name, value_or_values in list(normalized_attributes.items()):
                self.element[name] = value_or_values

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)

    attributes = property(getAttributes, setAttributes)

    def insertText(
        self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
    ) -> None:
        """Insert a new string into this element.

        :param data: The text to insert.
        :param insertBefore: If given, the child before which the text
            goes; otherwise the text is appended.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(
        self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
    ) -> None:
        """Insert ``node`` immediately before the child ``refNode``.

        A plain `NavigableString` inserted right after another plain
        `NavigableString` is merged with it rather than added as a
        separate child.

        :param node: The node to insert.
        :param refNode: An existing child of this element.
        """
        index = self.element.index(refNode.element)
        if (
            type(node.element) is NavigableString
            # Only look at the previous sibling if there is one. When
            # refNode is the first child, index - 1 is -1, which would
            # select the *last* child and could merge the new text
            # into the wrong end of the element.
            and index > 0
            and type(self.element.contents[index - 1]) is NavigableString
        ):
            # (See comments in appendChild)
            old_node = self.element.contents[index - 1]
            assert type(old_node) is NavigableString
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node: "Element") -> None:
        """Detach the element wrapped by ``node`` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent: "Element") -> None:
        """Move all of this tag's children into another tag.

        The children are appended after any children ``new_parent``
        already has. The sibling and next/previous-element links are
        patched by hand instead of moving children one at a time.

        :param new_parent: The node that receives the children.
        """
        # print("MOVE", self.element.contents)
        # print("FROM", self.element)
        # print("TO", new_parent.element)

        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.

            # We can make this assertion since we know new_parent has
            # children.
            assert new_parents_last_descendant is not None
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = (
                new_parents_last_descendant.next_element
            )
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(
                is_initialized=False, accept_self=True
            )

            # Since we passed accept_self=True into _last_descendant,
            # there's no possibility that the result is None.
            assert last_childs_last_descendant is not None
            last_childs_last_descendant.next_element = (
                new_parents_last_descendant_next_element
            )
            if new_parents_last_descendant_next_element is not None:
                # TODO-COVERAGE: This code has no test coverage and
                # I'm not sure how to get html5lib to go through this
                # path, but it's just the other side of the previous
                # line.
                new_parents_last_descendant_next_element.previous_element = (
                    last_childs_last_descendant
                )
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

        # print("DONE WITH MOVE")
        # print("FROM", self.element)
        # print("TO", new_parent_element)

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # hasContent returns a boolean, not None.
    def hasContent(self) -> bool:
        """Return whether the wrapped element has any children."""
        return len(self.element.contents) > 0

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:
        """Return a new `Element` with the same tag name, namespace
        and attributes as this one. Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
        """Return ``(namespace, name)``, substituting html5lib's HTML
        namespace when this element has no namespace of its own.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
586
587
class TextNode(BeautifulSoupNode):
    """html5lib node wrapper around a Beautiful Soup string object."""

    element: NavigableString

    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
        """
        :param element: The string object being wrapped.
        :param soup: The BeautifulSoup object being populated.
        """
        # A text node has no name, so the base class is given None.
        super().__init__(None)
        self.soup = soup
        self.element = element