1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
4__all__ = [
5 "HTML5TreeBuilder",
6]
7
8from typing import (
9 Any,
10 cast,
11 Dict,
12 Iterable,
13 Optional,
14 Sequence,
15 TYPE_CHECKING,
16 Tuple,
17 Union,
18)
19from typing_extensions import TypeAlias
20from bs4._typing import (
21 _AttributeValue,
22 _AttributeValues,
23 _Encoding,
24 _Encodings,
25 _NamespaceURL,
26 _RawMarkup,
27)
28
29import warnings
30from bs4.builder import (
31 DetectsXMLParsedAsHTML,
32 PERMISSIVE,
33 HTML,
34 HTML_5,
35 HTMLTreeBuilder,
36)
37from bs4.element import (
38 NamespacedAttribute,
39 PageElement,
40 nonwhitespace_re,
41)
42import html5lib
43from html5lib.constants import (
44 namespaces,
45)
46from bs4.element import (
47 Comment,
48 Doctype,
49 NavigableString,
50 Tag,
51)
52
53if TYPE_CHECKING:
54 from bs4 import BeautifulSoup
55
56from html5lib.treebuilders import base as treebuilder_base
57
58
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
    build a tree.

    Note that `HTML5TreeBuilder` does not support some common HTML
    `TreeBuilder` features. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    Specifically:

    * This `TreeBuilder` doesn't use different subclasses of
      `NavigableString` (e.g. `Script`) based on the name of the tag
      in which the string was found.
    * You can't use a `SoupStrainer` to parse only part of a document.
    """

    NAME: str = "html5lib"

    features: Iterable[str] = [NAME, PERMISSIVE, HTML_5, HTML]

    #: html5lib can tell us which line number and position in the
    #: original file is the source of an element.
    TRACKS_LINE_NUMBERS: bool = True

    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
    user_specified_encoding: Optional[_Encoding]

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Remember the user-specified encoding and hand the markup back
        unchanged.

        :param markup: The markup to be parsed.
        :param user_specified_encoding: Stored on this object so that
            `feed` can pass it to html5lib.
        :param document_declared_encoding: Not supported; a warning is
            issued if a truthy value is provided.
        :param exclude_encodings: Not supported; a warning is
            issued if a truthy value is provided.

        :return: A generator yielding exactly one 4-tuple: the markup
            as it was passed in, followed by None, None and False.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        for variable, name in (
            (document_declared_encoding, "document_declared_encoding"),
            (exclude_encodings, "exclude_encodings"),
        ):
            if variable:
                warnings.warn(
                    f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
                    stacklevel=3,
                )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup: _RawMarkup) -> None:
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.

        :param markup: The markup to parse. When it is not a `str`,
            the stored user-specified encoding is passed to html5lib
            as ``override_encoding``.
        """
        if self.soup is not None and self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4,
            )

        # self.underlying_builder is probably None now, but it'll be set
        # when html5lib calls self.create_treebuilder().
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        assert self.underlying_builder is not None
        self.underlying_builder.parser = parser
        try:
            extra_kwargs: Dict[str, Any] = {}
            if not isinstance(markup, str):
                # kwargs, specifically override_encoding, will eventually
                # be passed in to html5lib's
                # HTMLBinaryInputStream.__init__.
                extra_kwargs["override_encoding"] = self.user_specified_encoding

            doc = parser.parse(markup, **extra_kwargs)  # type:ignore

            # Set the character encoding detected by the tokenizer.
            if isinstance(markup, str):
                # We need to special-case this because html5lib sets
                # charEncoding to UTF-8 if it gets Unicode input.
                doc.original_encoding = None
            else:
                original_encoding = parser.tokenizer.stream.charEncoding[0]  # type:ignore
                # The encoding is an html5lib Encoding object. We want to
                # use a string for compatibility with other tree builders.
                doc.original_encoding = original_encoding.name
        finally:
            # Always drop the builder's reference to the parser, even
            # when parsing raised, so the two objects do not keep
            # pointing at each other after this call.
            self.underlying_builder.parser = None

    def create_treebuilder(
        self, namespaceHTMLElements: bool
    ) -> "TreeBuilderForHtml5lib":
        """Called by html5lib to instantiate the kind of class it
        calls a 'TreeBuilder'.

        The new object is also stored in `underlying_builder`.

        :param namespaceHTMLElements: Whether or not to namespace HTML elements.

        :meta private:
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return f"<html><head></head><body>{fragment}</body></html>"
172
173
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """The html5lib-side tree builder: it answers html5lib's requests
    for documents, elements, comments and doctypes by creating objects
    inside a `BeautifulSoup` object.
    """

    soup: "BeautifulSoup"  #: :meta private:
    parser: Optional[html5lib.HTMLParser]  #: :meta private:

    def __init__(
        self,
        namespaceHTMLElements: bool,
        soup: Optional["BeautifulSoup"] = None,
        store_line_numbers: bool = True,
        **kwargs: Any,
    ):
        """Constructor.

        :param namespaceHTMLElements: Passed through to the html5lib
            base class constructor.
        :param soup: The `BeautifulSoup` object to populate. Leaving
            this as None is deprecated; a `DeprecationWarning` is
            issued and an empty `BeautifulSoup` object is created.
        :param store_line_numbers: Whether `elementClass` should record
            the source line and position on each new tag.
        :param kwargs: Only used when ``soup`` is None, in which case
            they are passed to the `BeautifulSoup` constructor.
        """
        # Compare against None explicitly, as HTML5TreeBuilder.feed()
        # does, rather than relying on the truthiness of the soup object.
        if soup is not None:
            self.soup = soup
        else:
            warnings.warn(
                "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
                DeprecationWarning,
                stacklevel=2,
            )
            from bs4 import BeautifulSoup

            # TODO: Why is the parser 'html.parser' here? Using
            # html5lib doesn't cause an infinite loop and is more
            # accurate. Best to get rid of this entire section, I think.
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        #
        # NOTE(review): self.soup is assigned before the base
        # initializer runs; presumably the base class may call back
        # into methods that need it -- keep this ordering.
        super().__init__(namespaceHTMLElements)

        # This will be set later to a real html5lib HTMLParser object,
        # which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self) -> "Element":
        """Reset the soup and return an `Element` wrapping it as the
        document node.
        """
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token: Dict[str, Any]) -> None:
        """Build a `Doctype` from the token's ``name``, ``publicId`` and
        ``systemId`` entries and register it with the soup.
        """
        name: str = cast(str, token["name"])
        publicId: Optional[str] = cast(Optional[str], token["publicId"])
        systemId: Optional[str] = cast(Optional[str], token["systemId"])

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name: str, namespace: str) -> "Element":
        """Create a new tag in the soup and return an `Element`
        wrapping it.

        If a parser is attached and line numbers are being stored, the
        tag records the tokenizer's current position.
        """
        sourceline: Optional[int] = None
        sourcepos: Optional[int] = None
        if self.parser is not None and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()  # type:ignore
            assert sourcepos is not None
            sourcepos = sourcepos - 1
        tag = self.soup.new_tag(
            name, namespace, sourceline=sourceline, sourcepos=sourcepos
        )

        return Element(tag, self.soup, namespace)

    def commentClass(self, data: str) -> "TextNode":
        """Return a `TextNode` wrapping a new `Comment` made from ``data``."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self) -> "Element":
        """This is only used by html5lib HTMLParser.parseFragment(),
        which is never used by Beautiful Soup, only by the html5lib
        unit tests. Since we don't currently hook into those tests,
        the implementation is left blank.
        """
        raise NotImplementedError()

    def getFragment(self) -> "Element":
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()

    def appendChild(self, node: "Element") -> None:
        """Append the object wrapped by ``node`` directly to the soup."""
        # TODO: This code is not covered by the BS4 tests, and
        # apparently not triggered by the html5lib test suite either.
        # But it doesn't seem test-specific and there are calls to it
        # (or a method with the same name) all over html5lib, so I'm
        # leaving the implementation in place rather than replacing it
        # with NotImplementedError()
        self.soup.append(node.element)

    def getDocument(self) -> "BeautifulSoup":
        """Return the `BeautifulSoup` object being populated."""
        return self.soup

    def testSerializer(self, node: "Element") -> None:
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()
275
276
class AttrList(object):
    """Represents a Tag's attributes in a way compatible with html5lib.

    Reads are served from `attrs`, a copy of the tag's attribute
    dictionary taken when this object is created. Writes made through
    ``__setitem__`` go to the underlying tag only; they are not
    reflected in `attrs`.
    """

    element: Tag
    attrs: _AttributeValues

    def __init__(self, element: Tag):
        """Constructor.

        :param element: The tag whose attributes are exposed. Its
            ``attrs`` mapping is shallow-copied.
        """
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Iterate over a list copy of the (name, value) pairs."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name: str, value: _AttributeValue) -> None:
        """Set an attribute on the underlying tag.

        If the attribute is multi-valued for this element and the value
        is not already a list, it is split on whitespace into the
        element's attribute value list class first.
        """
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        is_multi_valued = name in list_attr.get("*", []) or name in list_attr.get(
            self.element.name, []
        )
        # A node that is being cloned may have already undergone
        # this procedure. Check for this and skip it.
        if is_multi_valued and not isinstance(value, list):
            assert isinstance(value, str)
            value = self.element.attribute_value_list_class(
                nonwhitespace_re.findall(value)
            )
        self.element[name] = value

    def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Return the (name, value) pairs as a list."""
        return list(self.attrs.items())

    def keys(self) -> Iterable[str]:
        """Return the attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self) -> int:
        """Return the number of attributes."""
        return len(self.attrs)

    def __getitem__(self, name: str) -> _AttributeValue:
        """Return the value stored for ``name``.

        :raise KeyError: If there is no such attribute.
        """
        return self.attrs[name]

    def __contains__(self, name: str) -> bool:
        """Whether an attribute called ``name`` is present."""
        # Ask the dict directly instead of building and scanning a list
        # of its keys.
        return name in self.attrs
321
322
class BeautifulSoupNode(treebuilder_base.Node):
    """Common base for the html5lib node wrappers in this module.

    Each wrapper holds exactly one Beautiful Soup object, stored in
    either `tag` or `string`.
    """

    # A node can correspond to _either_ a Tag _or_ a NavigableString.
    tag: Optional[Tag]
    string: Optional[NavigableString]
    soup: "BeautifulSoup"
    namespace: Optional[_NamespaceURL]

    @property
    def element(self) -> PageElement:
        """The wrapped Beautiful Soup object: the tag when there is one,
        otherwise the string.
        """
        wrapped_tag = self.tag
        if wrapped_tag is not None:
            return wrapped_tag
        # No tag, so this node has to be wrapping a string.
        assert self.string is not None
        return self.string

    @property
    def nodeType(self) -> int:
        """Return the html5lib constant corresponding to the type of
        the underlying DOM object.

        NOTE: This property is only accessed by the html5lib test
        suite, not by Beautiful Soup proper.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError()

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:  # type:ignore
        """Not implemented on the base class.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError()
353
354
class Element(BeautifulSoupNode):
    """An html5lib node wrapping a Beautiful Soup `Tag`."""

    namespace: Optional[_NamespaceURL]

    def __init__(
        self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
    ):
        """Constructor.

        :param element: The tag this node wraps.
        :param soup: The `BeautifulSoup` object the tag belongs to.
        :param namespace: The namespace of the tag, if any.
        """
        # NOTE(review): these are assigned before the base initializer
        # runs; presumably it may touch the `attributes` property, whose
        # setter needs self.tag -- keep this ordering.
        self.tag = element
        self.string = None
        self.soup = soup
        self.namespace = namespace
        treebuilder_base.Node.__init__(self, element.name)

    def appendChild(self, node: "BeautifulSoupNode") -> None:
        """Make the object wrapped by ``node`` the last child of this
        tag.

        A plain `NavigableString` appended right after another plain
        `NavigableString` is merged with it instead of being added as a
        separate child.
        """
        string_child: Optional[NavigableString] = None
        child: PageElement
        if type(node.string) is NavigableString:
            # We check for NavigableString *only* because we want to avoid
            # joining PreformattedStrings, such as Comments, with nearby strings.
            string_child = child = node.string
        else:
            child = node.element
        node.parent = self

        if (
            child is not None
            and child.parent is not None
            and not isinstance(child, str)
        ):
            node.element.extract()

        if (
            string_child is not None
            and self.tag is not None and self.tag.contents
            and type(self.tag.contents[-1]) is NavigableString
        ):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.tag.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.tag is not None and self.tag.contents:
                most_recent_element = self.tag._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.tag, most_recent_element=most_recent_element
            )

    def getAttributes(self) -> AttrList:
        """Return a fresh `AttrList` view of this tag's attributes."""
        assert self.tag is not None
        return AttrList(self.tag)

    # An HTML5lib attribute name may either be a single string,
    # or a tuple (namespace, name).
    _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
    # Now we can define the type this method accepts as a dictionary
    # mapping those attribute names to single string values.
    _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]

    def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
        """Copy html5lib's attributes onto the wrapped tag.

        Nothing happens when ``attributes`` is None or empty. Otherwise
        the dictionary passed in is modified in place: tuple keys are
        replaced with `NamespacedAttribute` keys, and the builder may
        replace multi-valued attribute values.
        """
        assert self.tag is not None
        if attributes is None or len(attributes) == 0:
            return

        # Replace any namespaced attributes with
        # NamespacedAttribute objects.
        for name, value in list(attributes.items()):
            if isinstance(name, tuple):
                new_name = NamespacedAttribute(*name)
                del attributes[name]
                attributes[new_name] = value

        # We can now cast attributes to the type of Dict
        # used by Beautiful Soup.
        normalized_attributes = cast(_AttributeValues, attributes)

        # Values for tags like 'class' came in as single strings;
        # replace them with lists of strings as appropriate.
        self.soup.builder._replace_cdata_list_attribute_values(
            self.name, normalized_attributes
        )

        # Then set the attributes on the Tag associated with this
        # BeautifulSoupNode.
        for name, value_or_values in list(normalized_attributes.items()):
            self.tag[name] = value_or_values

        # The attributes may contain variables that need substitution.
        # Call set_up_substitutions manually.
        #
        # The Tag constructor called this method when the Tag was created,
        # but we just set/changed the attributes, so call it again.
        self.soup.builder.set_up_substitutions(self.tag)

    attributes = property(getAttributes, setAttributes)

    def insertText(
        self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
    ) -> None:
        """Insert ``data`` as a new string, either before the node given
        as ``insertBefore`` or, without one, at the end of this tag.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(
        self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
    ) -> None:
        """Insert ``node`` into this tag immediately before ``refNode``.

        A plain `NavigableString` inserted right after another plain
        `NavigableString` is merged with it instead of being inserted
        as a separate child.
        """
        assert self.tag is not None
        index = self.tag.index(refNode.element)
        if (
            type(node.element) is NavigableString
            # Only look at the previous sibling when there is one. At
            # index 0, contents[index - 1] would wrap around to the
            # *last* child and the text could be merged into the wrong
            # string.
            and index > 0
            and type(self.tag.contents[index - 1]) is NavigableString
        ):
            # (See comments in appendChild)
            old_node = self.tag.contents[index - 1]
            assert type(old_node) is NavigableString
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.tag.insert(index, node.element)
            node.parent = self

    def removeChild(self, node: "Element") -> None:
        """Extract the object wrapped by ``node`` from the tree."""
        node.element.extract()

    def reparentChildren(self, newParent: "Element") -> None:
        """Move all of this tag's children into another tag.

        The children are appended after any children the new parent
        already has, and the element/sibling links on both sides are
        rewired by hand.
        """
        element = self.tag
        assert element is not None
        new_parent_element = newParent.tag
        assert new_parent_element is not None
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.

            # We can make this assertion since we know new_parent has
            # children.
            assert new_parents_last_descendant is not None
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = (
                new_parents_last_descendant.next_element
            )
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(
                is_initialized=False, accept_self=True
            )

            # Since we passed accept_self=True into _last_descendant,
            # there's no possibility that the result is None.
            assert last_childs_last_descendant is not None
            last_childs_last_descendant.next_element = (
                new_parents_last_descendant_next_element
            )
            if new_parents_last_descendant_next_element is not None:
                # TODO-COVERAGE: This code has no test coverage and
                # I'm not sure how to get html5lib to go through this
                # path, but it's just the other side of the previous
                # line.
                new_parents_last_descendant_next_element.previous_element = (
                    last_childs_last_descendant
                )
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # hasContent returns a boolean, not None.
    def hasContent(self) -> bool:  # type:ignore
        """Whether the wrapped tag has any children. Also True when
        there is no wrapped tag at all.
        """
        return self.tag is None or len(self.tag.contents) > 0

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:  # type:ignore
        """Return a new `Element` wrapping a new tag with the same name,
        namespace and attributes. Children are not copied.
        """
        assert self.tag is not None
        tag = self.soup.new_tag(self.tag.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
        """Return (namespace, name), substituting html5lib's HTML
        namespace when this node has no namespace of its own.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
603
604
class TextNode(BeautifulSoupNode):
    """An html5lib node wrapping a Beautiful Soup `NavigableString`."""

    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
        """Constructor.

        :param element: The string this node wraps.
        :param soup: The `BeautifulSoup` object the string belongs to.
        """
        # A text node has no name, so the html5lib base class gets None.
        super().__init__(None)
        self.soup = soup
        self.string = element
        self.tag = None