Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/_lxml.py: 4%

174 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4__all__ = [ 

5 'LXMLTreeBuilderForXML', 

6 'LXMLTreeBuilder', 

7 ] 

8 

9try: 

10 from collections.abc import Callable # Python 3.6 

11except ImportError as e: 

12 from collections import Callable 

13 

14from io import BytesIO 

15from io import StringIO 

16from lxml import etree 

17from bs4.element import ( 

18 Comment, 

19 Doctype, 

20 NamespacedAttribute, 

21 ProcessingInstruction, 

22 XMLProcessingInstruction, 

23) 

24from bs4.builder import ( 

25 DetectsXMLParsedAsHTML, 

26 FAST, 

27 HTML, 

28 HTMLTreeBuilder, 

29 PERMISSIVE, 

30 ParserRejectedMarkup, 

31 TreeBuilder, 

32 XML) 

33from bs4.dammit import EncodingDetector 

34 

35LXML = 'lxml' 

36 

37def _invert(d): 

38 "Invert a dictionary." 

39 return dict((v,k) for k, v in list(d.items())) 

40 

class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that parses XML using lxml.

    This class acts as the *target* of an ``etree.XMLParser`` (it is
    constructed with ``target=self`` in :meth:`default_parser`): lxml
    invokes the event-handler methods below (``start``, ``end``,
    ``data``, ``pi``, ``doctype``, ``comment``, ``close``) as it
    parses, and they forward the events to the `BeautifulSoup` object.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Number of bytes/characters read per call to markup.read() when
    # feed() streams the document into the parser.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    # The same mapping, keyed by URI rather than by prefix.
    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Constructor.

        :param parser: An lxml parser object (used as-is) or a callable
            that produces one per encoding; if None, default_parser()
            builds one on demand.
        :param empty_element_tags: Optional iterable of tag names to be
            treated as empty-element tags.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace mappings, pushed
        # in start() and popped in end(). A None entry marks a tag that
        # introduced no new namespaces.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        # Stack of (prefix -> URI) mappings currently in scope; the top
        # is attached to each Tag created while those prefixes apply.
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]
        detector = EncodingDetector(
            markup, known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings, is_html=is_html,
            exclude_encodings=exclude_encodings
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Stream `markup` into the lxml parser, CHUNK_SIZE at a time.

        :param markup: A bytestring, string, or open filehandle.
        :raise ParserRejectedMarkup: If lxml raises while parsing,
            or an encoding lookup/decode fails.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self):
        """Parser-target event handler: parsing is finished.

        Reset the namespace-mapping stack to its initial state.
        """
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap={}):
        """Parser-target event handler: the start of a tag.

        :param name: The tag name, possibly qualified as '{uri}local'.
        :param attrs: The tag's attributes (may be an immutable proxy).
        :param nsmap: Namespace prefixes newly declared on this tag.
            NOTE(review): the mutable-default pitfall doesn't apply
            here because nsmap is only ever read, never mutated.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if '' in current_mapping:
                del current_mapping['']
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            name, namespace, nsprefix, attrs,
            namespaces=self.active_namespace_prefixes[-1]
        )

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Parser-target event handler: the end of a tag.

        Tells the BeautifulSoup object the tag is over, and pops any
        namespace mapping that this tag pushed in start().
        """
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            # Look up the innermost prefix currently bound to this URI.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target, data):
        """Parser-target event handler: a processing instruction.

        The target and data are joined and stored as a single
        processing-instruction object.
        """
        self.soup.endData()
        data = target + ' ' + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """Parser-target event handler: textual data inside a tag."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Parser-target event handler: a DOCTYPE declaration."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment

360 

361 

class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The lxml-based tree builder for HTML documents.

    Inherits the parser-target event handlers from
    `LXMLTreeBuilderForXML`, but drives an ``etree.HTMLParser`` and
    registers itself under HTML-oriented feature names.
    """

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """Return the parser to use for HTML.

        The class itself (not an instance) is returned; parser_for()
        sees that it is callable and instantiates it with the
        requested encoding.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Hand the entire markup to lxml's HTML parser in one shot.

        :param markup: The markup to parse.
        :raise ParserRejectedMarkup: If parsing fails with a decode,
            encoding-lookup, or lxml parser error.
        """
        encoding = self.soup.original_encoding
        try:
            parser = self.parser_for(encoding)
            self.parser = parser
            parser.feed(markup)
            parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as cause:
            raise ParserRejectedMarkup(cause)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment