Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/_lxml.py: 4%
174 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
# Use of this source code is governed by the MIT license.
"""lxml-based tree builders for Beautiful Soup.

Defines `LXMLTreeBuilderForXML` (XML) and `LXMLTreeBuilder` (HTML),
both of which drive an lxml parser and relay its parse events to a
BeautifulSoup object.
"""
__license__ = "MIT"

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

try:
    # Python 3.6+: abstract base classes live in collections.abc.
    from collections.abc import Callable # Python 3.6
except ImportError as e:
    # Fallback for very old Pythons where Callable was in collections.
    from collections import Callable

from io import BytesIO
from io import StringIO
from lxml import etree
from bs4.element import (
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
    XMLProcessingInstruction,
)
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    ParserRejectedMarkup,
    TreeBuilder,
    XML)
from bs4.dammit import EncodingDetector

# Feature name under which these builders are registered.
LXML = 'lxml'
37def _invert(d):
38 "Invert a dictionary."
39 return dict((v,k) for k, v in list(d.items()))
class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that uses lxml's XMLParser, acting as the parser's
    event target and forwarding parse events to a BeautifulSoup object.
    """

    # Parser class used when no parser (or parser factory) is supplied.
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to the parser in chunks of this size; see feed().
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    # URI -> prefix form of DEFAULT_NSMAPS, used as the base of the
    # nsmaps stack maintained by start()/end().
    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        # No parser was supplied at construction time; build an
        # XMLParser that reports its events to this object.
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Constructor.

        :param parser: An lxml parser object, or a callable (such as a
            parser class) that parser_for() can invoke to build one
            per encoding. If None, default_parser() creates an
            etree.XMLParser on demand.
        :param empty_element_tags: If given, the set of tag names to
            treat as empty-element tags.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace maps; start()
        # pushes and end() pops as namespace scopes open and close.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        # Stack of prefix -> URI maps describing all prefixes in scope
        # at each point; the top is attached to newly created Tags.
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            # No namespace component.
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]
        detector = EncodingDetector(
            markup, known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings, is_html=is_html,
            exclude_encodings=exclude_encodings
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Feed markup to the parser in CHUNK_SIZE pieces.

        :param markup: bytes, str, or a file-like object.
        :raise ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Normalize lxml/codec failures into the exception
            # BeautifulSoup uses to try the next parse strategy.
            raise ParserRejectedMarkup(e)

    def close(self):
        # Parsing is over; reset the namespace stack to its default state.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap={}):
        """lxml target callback: an opening tag was encountered.

        NOTE(review): the mutable default for `nsmap` is safe here
        because this method only reads it, never mutates it.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if '' in current_mapping:
                del current_mapping['']
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            name, namespace, nsprefix, attrs,
            namespaces=self.active_namespace_prefixes[-1]
        )

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search innermost scope first.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """lxml target callback: a closing tag was encountered."""
        self.soup.endData()
        # NOTE(review): completed_tag is assigned but never used here.
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target, data):
        """lxml target callback: a processing instruction was encountered."""
        self.soup.endData()
        data = target + ' ' + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """lxml target callback: textual data was encountered."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """lxml target callback: a DOCTYPE declaration was encountered."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The HTML flavor of the lxml tree builder.

    Inherits the lxml target callbacks from LXMLTreeBuilderForXML but
    parses with lxml's HTMLParser instead of its XMLParser.
    """

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """Always use lxml's HTMLParser class; parser_for() will
        instantiate it with the appropriate arguments."""
        return etree.HTMLParser

    def feed(self, markup):
        """Feed the entire markup to the parser in one call.

        :raise ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        try:
            parser = self.parser_for(self.soup.original_encoding)
            self.parser = parser
            parser.feed(markup)
            parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as exc:
            # Translate parse failures into the exception BeautifulSoup
            # uses to move on to its next parsing strategy.
            raise ParserRejectedMarkup(exc)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment