Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/

1# Use of this source code is governed by the MIT license.

2__license__ = "MIT"

4__all__ = [

5 "HTML5TreeBuilder",

8from typing import (

9 Any,

10 cast,

11 Dict,

12 Iterable,

13 Optional,

14 Sequence,

15 TYPE_CHECKING,

16 Tuple,

17 Union,

18)

19from typing_extensions import TypeAlias

20from bs4._typing import (

21 _AttributeValue,

22 _AttributeValues,

23 _Encoding,

24 _Encodings,

25 _NamespaceURL,

26 _RawMarkup,

27)

29import warnings

30from bs4.builder import (

31 DetectsXMLParsedAsHTML,

32 PERMISSIVE,

33 HTML,

34 HTML_5,

35 HTMLTreeBuilder,

36)

37from bs4.element import (

38 NamespacedAttribute,

39 PageElement,

40 nonwhitespace_re,

41)

42import html5lib

43from html5lib.constants import (

44 namespaces,

45)

46from bs4.element import (

47 Comment,

48 Doctype,

49 NavigableString,

50 Tag,

51)

53if TYPE_CHECKING:

54 from bs4 import BeautifulSoup

56from html5lib.treebuilders import base as treebuilder_base

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Build a parse tree with `html5lib
    <https://github.com/html5lib/html5lib-python>`_.

    `HTML5TreeBuilder` lacks some features that other `TreeBuilder`
    classes commonly offer. A few of them could in theory be added,
    but it would be quite difficult at the very least, because
    html5lib moves the parse tree around while it's being built.

    Specifically:

    * This `TreeBuilder` doesn't pick a subclass of `NavigableString`
      (e.g. `Script`) based on the name of the tag in which the
      string was found.
    * You can't use a `SoupStrainer` to parse only part of a document.
    """

    NAME: str = "html5lib"

    features: Iterable[str] = [NAME, PERMISSIVE, HTML_5, HTML]

    #: html5lib can report the line number and position in the
    #: original file that an element came from.
    TRACKS_LINE_NUMBERS: bool = True

    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
    user_specified_encoding: Optional[_Encoding]

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Yield the single parsing strategy this builder supports.

        The markup is passed through untouched as the one yielded
        tuple ``(markup, None, None, False)``.

        :param markup: The markup that is about to be parsed.
        :param user_specified_encoding: Remembered on the builder so
            that `feed` can hand it to html5lib later.
        :param document_declared_encoding: Unsupported by this
            builder; a truthy value only triggers a warning.
        :param exclude_encodings: Unsupported by this builder; a
            truthy value only triggers a warning.
        """
        # Keep the caller's encoding around; feed() needs it.
        self.user_specified_encoding = user_specified_encoding

        # The html5lib TreeBuilder doesn't go through UnicodeDammit,
        # so neither of these arguments has any effect at the moment.
        # Tell the caller about any that were actually supplied.
        ignored_arguments = {
            "document_declared_encoding": document_declared_encoding,
            "exclude_encodings": exclude_encodings,
        }
        for name, value in ignored_arguments.items():
            if not value:
                continue
            warnings.warn(
                f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
                stacklevel=3,
            )

        # html5lib is an HTML-only parser, so flag input that looks
        # like XML.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup: _RawMarkup) -> None:
        """Parse the incoming markup with html5lib, populating the
        `BeautifulSoup` object in `HTML5TreeBuilder.soup`.

        :param markup: The markup to parse, as a string or as bytes.
        """
        if self.soup is not None and self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4,
            )

        # Constructing the HTMLParser makes html5lib call
        # self.create_treebuilder(), which is what sets
        # self.underlying_builder.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        assert self.underlying_builder is not None
        self.underlying_builder.parser = parser

        markup_is_text = isinstance(markup, str)

        # For non-string markup, override_encoding eventually reaches
        # html5lib's HTMLBinaryInputStream.__init__. String markup gets
        # no extra keyword arguments.
        parse_kwargs = (
            {} if markup_is_text
            else {"override_encoding": self.user_specified_encoding}
        )

        doc = parser.parse(markup, **parse_kwargs)  # type:ignore

        # Record the character encoding detected by the tokenizer.
        if markup_is_text:
            # Special case: html5lib sets charEncoding to UTF-8
            # whenever it is handed Unicode input.
            detected_encoding = None
        else:
            # charEncoding[0] is an html5lib Encoding object; store its
            # name as a string for compatibility with other tree
            # builders.
            detected_encoding = parser.tokenizer.stream.charEncoding[0].name  # type:ignore
        doc.original_encoding = detected_encoding

        self.underlying_builder.parser = None

    def create_treebuilder(
        self, namespaceHTMLElements: bool
    ) -> "TreeBuilderForHtml5lib":
        """Called by html5lib to instantiate the kind of class it
        calls a 'TreeBuilder'.

        The new object is also stored as ``self.underlying_builder``.

        :param namespaceHTMLElements: Whether or not to namespace HTML elements.

        :meta private:
        """
        builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
        )
        self.underlying_builder = builder
        return builder

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        document_template = "<html><head></head><body>%s</body></html>"
        return document_template % fragment

172

173

174class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

175 soup: "BeautifulSoup" #: :meta private:

176 parser: Optional[html5lib.HTMLParser] #: :meta private:

177

178 def __init__(

179 self,

180 namespaceHTMLElements: bool,

181 soup: Optional["BeautifulSoup"] = None,

182 store_line_numbers: bool = True,

183 **kwargs: Any,

184 ):

185 if soup:

186 self.soup = soup

187 else:

188 warnings.warn(

189 "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",

190 DeprecationWarning,

191 stacklevel=2,

192 )

193 from bs4 import BeautifulSoup

194

195 # TODO: Why is the parser 'html.parser' here? Using

196 # html5lib doesn't cause an infinite loop and is more

197 # accurate. Best to get rid of this entire section, I think.

198 self.soup = BeautifulSoup(

199 "", "html.parser", store_line_numbers=store_line_numbers, **kwargs

200 )

201 # TODO: What are **kwargs exactly? Should they be passed in