Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/_lxml.py: 28%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

205 statements  

1# encoding: utf-8 

2from __future__ import annotations 

3 

4# Use of this source code is governed by the MIT license. 

5__license__ = "MIT" 

6 

7__all__ = [ 

8 "LXMLTreeBuilderForXML", 

9 "LXMLTreeBuilder", 

10] 

11 

12 

13from typing import ( 

14 Any, 

15 Dict, 

16 Iterable, 

17 List, 

18 Optional, 

19 Set, 

20 Tuple, 

21 Type, 

22 TYPE_CHECKING, 

23 Union, 

24) 

25 

26from io import BytesIO 

27from io import StringIO 

28 

29from typing_extensions import TypeAlias 

30 

31from lxml import etree # type:ignore 

32from bs4.element import ( 

33 AttributeDict, 

34 XMLAttributeDict, 

35 Comment, 

36 Doctype, 

37 NamespacedAttribute, 

38 ProcessingInstruction, 

39 XMLProcessingInstruction, 

40) 

41from bs4.builder import ( 

42 DetectsXMLParsedAsHTML, 

43 FAST, 

44 HTML, 

45 HTMLTreeBuilder, 

46 PERMISSIVE, 

47 TreeBuilder, 

48 XML, 

49) 

50from bs4.dammit import EncodingDetector 

51from bs4.exceptions import ParserRejectedMarkup 

52 

53if TYPE_CHECKING: 

54 from bs4._typing import ( 

55 _Encoding, 

56 _Encodings, 

57 _NamespacePrefix, 

58 _NamespaceURL, 

59 _NamespaceMapping, 

60 _InvertedNamespaceMapping, 

61 _RawMarkup, 

62 ) 

63 from bs4 import BeautifulSoup 

64 

# Feature name shared by both tree builders defined in this module.
LXML: str = "lxml"

66 

67 

68def _invert(d: dict[Any, Any]) -> dict[Any, Any]: 

69 "Invert a dictionary." 

70 return dict((v, k) for k, v in list(d.items())) 

71 

72 

# Type aliases for the lxml parser machinery used throughout this module.
#
# Either kind of lxml parser instance this module may drive.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
# A parser instance, or a parser class that will be instantiated with
# default arguments (see default_parser()/parser_for()).
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]

77 

78 

class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that drives lxml's XML parser in event (target)
    mode: this object is passed as the parser's ``target`` and receives
    start/end/data/etc. callbacks instead of building an lxml tree.
    """

    # Parser class instantiated when the user doesn't supply a parser.
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    # Class used to wrap processing instructions; chosen in __init__
    # based on is_xml.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to lxml in chunks of this many characters/bytes
    # (see feed()).
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    # Stack of inverted (URL -> prefix) namespace mappings, pushed in
    # start() and popped in end(); None entries mark tags that
    # introduced no new namespaces while others were in scope.
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Optional[Set[str]]
    parser: Any
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

111 

112 def initialize_soup(self, soup: BeautifulSoup) -> None: 

113 """Let the BeautifulSoup object know about the standard namespace 

114 mapping. 

115 

116 :param soup: A `BeautifulSoup`. 

117 """ 

118 # Beyond this point, self.soup is set, so we can assume (and 

119 # assert) it's not None whenever necessary. 

120 super(LXMLTreeBuilderForXML, self).initialize_soup(soup) 

121 self._register_namespaces(self.DEFAULT_NSMAPS) 

122 

123 def _register_namespaces(self, mapping: Dict[str, str]) -> None: 

124 """Let the BeautifulSoup object know about namespaces encountered 

125 while parsing the document. 

126 

127 This might be useful later on when creating CSS selectors. 

128 

129 This will track (almost) all namespaces, even ones that were 

130 only in scope for part of the document. If two namespaces have 

131 the same prefix, only the first one encountered will be 

132 tracked. Un-prefixed namespaces are not tracked. 

133 

134 :param mapping: A dictionary mapping namespace prefixes to URIs. 

135 """ 

136 assert self.soup is not None 

137 for key, value in list(mapping.items()): 

138 # This is 'if key' and not 'if key is not None' because we 

139 # don't track un-prefixed namespaces. Soupselect will 

140 # treat an un-prefixed namespace as the default, which 

141 # causes confusion in some cases. 

142 if key and key not in self.soup._namespaces: 

143 # Let the BeautifulSoup object know about a new namespace. 

144 # If there are multiple namespaces defined with the same 

145 # prefix, the first one in the document takes precedence. 

146 self.soup._namespaces[key] = value 

147 

148 def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass: 

149 """Find the default parser for the given encoding. 

150 

151 :return: Either a parser object or a class, which 

152 will be instantiated with default arguments. 

153 """ 

154 if self._default_parser is not None: 

155 return self._default_parser 

156 return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding) 

157 

158 def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser: 

159 """Instantiate an appropriate parser for the given encoding. 

160 

161 :param encoding: A string. 

162 :return: A parser object such as an `etree.XMLParser`. 

163 """ 

164 # Use the default parser. 

165 parser = self.default_parser(encoding) 

166 

167 if callable(parser): 

168 # Instantiate the parser with default arguments 

169 parser = parser(target=self, recover=True, encoding=encoding) 

170 return parser 

171 

172 def __init__( 

173 self, 

174 parser: Optional[etree.XMLParser] = None, 

175 empty_element_tags: Optional[Set[str]] = None, 

176 **kwargs: Any, 

177 ): 

178 # TODO: Issue a warning if parser is present but not a 

179 # callable, since that means there's no way to create new 

180 # parsers for different encodings. 

181 self._default_parser = parser 

182 self.soup = None 

183 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] 

184 self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)] 

185 if self.is_xml: 

186 self.processing_instruction_class = XMLProcessingInstruction 

187 else: 

188 self.processing_instruction_class = ProcessingInstruction 

189 

190 if "attribute_dict_class" not in kwargs: 

191 kwargs["attribute_dict_class"] = XMLAttributeDict 

192 super(LXMLTreeBuilderForXML, self).__init__(**kwargs) 

193 

194 def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]: 

195 # Split the namespace URL out of a fully-qualified lxml tag 

196 # name. Copied from lxml's src/lxml/sax.py. 

197 if tag[0] == "{" and "}" in tag: 

198 namespace, name = tag[1:].split("}", 1) 

199 return (namespace, name) 

200 return (None, tag) 

201 

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

        Each 4-tuple represents a strategy for converting the
        document to Unicode and parsing it. Each strategy will be tried
        in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                # Strip a leading BOM before handing the string to lxml.
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        # If the caller resumes this generator, the Unicode strategy
        # above must have failed.
        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            # Offer the same byte markup under each candidate encoding.
            yield (detector.markup, encoding, document_declared_encoding, False)

284 

285 def feed(self, markup: _RawMarkup) -> None: 

286 io: Union[BytesIO, StringIO] 

287 if isinstance(markup, bytes): 

288 io = BytesIO(markup) 

289 elif isinstance(markup, str): 

290 io = StringIO(markup) 

291 

292 # initialize_soup is called before feed, so we know this 

293 # is not None. 

294 assert self.soup is not None 

295 

296 # Call feed() at least once, even if the markup is empty, 

297 # or the parser won't be initialized. 

298 data = io.read(self.CHUNK_SIZE) 

299 try: 

300 self.parser = self.parser_for(self.soup.original_encoding) 

301 self.parser.feed(data) 

302 while len(data) != 0: 

303 # Now call feed() on the rest of the data, chunk by chunk. 

304 data = io.read(self.CHUNK_SIZE) 

305 if len(data) != 0: 

306 self.parser.feed(data) 

307 self.parser.close() 

308 except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 

309 raise ParserRejectedMarkup(e) 

310 

311 def close(self) -> None: 

312 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] 

313 

314 def start( 

315 self, 

316 tag: str | bytes, 

317 attrib: Dict[str | bytes, str | bytes], 

318 nsmap: _NamespaceMapping = {}, 

319 ) -> None: 

320 # This is called by lxml code as a result of calling 

321 # BeautifulSoup.feed(), and we know self.soup is set by the time feed() 

322 # is called. 

323 assert self.soup is not None 

324 assert isinstance(tag, str) 

325 

326 # We need to recreate the attribute dict for three 

327 # reasons. First, for type checking, so we can assert there 

328 # are no bytestrings in the keys or values. Second, because we 

329 # need a mutable dict--lxml might send us an immutable 

330 # dictproxy. Third, so we can handle namespaced attribute 

331 # names by converting the keys to NamespacedAttributes. 

332 new_attrib: Dict[Union[str, NamespacedAttribute], str] = ( 

333 self.attribute_dict_class() 

334 ) 

335 for k, v in attrib.items(): 

336 assert isinstance(k, str) 

337 assert isinstance(v, str) 

338 new_attrib[k] = v 

339 

340 nsprefix: Optional[_NamespacePrefix] = None 

341 namespace: Optional[_NamespaceURL] = None 

342 # Invert each namespace map as it comes in. 

343 if len(nsmap) == 0 and len(self.nsmaps) > 1: 

344 # There are no new namespaces for this tag, but 

345 # non-default namespaces are in play, so we need a 

346 # separate tag stack to know when they end. 

347 self.nsmaps.append(None) 

348 elif len(nsmap) > 0: 

349 # A new namespace mapping has come into play. 

350 

351 # First, Let the BeautifulSoup object know about it. 

352 self._register_namespaces(nsmap) 

353 

354 # Then, add it to our running list of inverted namespace 

355 # mappings. 

356 self.nsmaps.append(_invert(nsmap)) 

357 

358 # The currently active namespace prefixes have 

359 # changed. Calculate the new mapping so it can be stored 

360 # with all Tag objects created while these prefixes are in 

361 # scope. 

362 current_mapping = dict(self.active_namespace_prefixes[-1]) 

363 current_mapping.update(nsmap) 

364 

365 # We should not track un-prefixed namespaces as we can only hold one 

366 # and it will be recognized as the default namespace by soupsieve, 

367 # which may be confusing in some situations. 

368 if "" in current_mapping: 

369 del current_mapping[""] 

370 self.active_namespace_prefixes.append(current_mapping) 

371 

372 # Also treat the namespace mapping as a set of attributes on the 

373 # tag, so we can recreate it later. 

374 for prefix, namespace in list(nsmap.items()): 

375 attribute = NamespacedAttribute( 

376 "xmlns", prefix, "http://www.w3.org/2000/xmlns/" 

377 ) 

378 new_attrib[attribute] = namespace 

379 

380 # Namespaces are in play. Find any attributes that came in 

381 # from lxml with namespaces attached to their names, and 

382 # turn then into NamespacedAttribute objects. 

383 final_attrib: AttributeDict = self.attribute_dict_class() 

384 for attr, value in list(new_attrib.items()): 

385 namespace, attr = self._getNsTag(attr) 

386 if namespace is None: 

387 final_attrib[attr] = value 

388 else: 

389 nsprefix = self._prefix_for_namespace(namespace) 

390 attr = NamespacedAttribute(nsprefix, attr, namespace) 

391 final_attrib[attr] = value 

392 

393 namespace, tag = self._getNsTag(tag) 

394 nsprefix = self._prefix_for_namespace(namespace) 

395 self.soup.handle_starttag( 

396 tag, 

397 namespace, 

398 nsprefix, 

399 final_attrib, 

400 namespaces=self.active_namespace_prefixes[-1], 

401 ) 

402 

403 def _prefix_for_namespace( 

404 self, namespace: Optional[_NamespaceURL] 

405 ) -> Optional[_NamespacePrefix]: 

406 """Find the currently active prefix for the given namespace.""" 

407 if namespace is None: 

408 return None 

409 for inverted_nsmap in reversed(self.nsmaps): 

410 if inverted_nsmap is not None and namespace in inverted_nsmap: 

411 return inverted_nsmap[namespace] 

412 return None 

413 

414 def end(self, tag: str | bytes) -> None: 

415 assert self.soup is not None 

416 assert isinstance(tag, str) 

417 self.soup.endData() 

418 namespace, tag = self._getNsTag(tag) 

419 nsprefix = None 

420 if namespace is not None: 

421 for inverted_nsmap in reversed(self.nsmaps): 

422 if inverted_nsmap is not None and namespace in inverted_nsmap: 

423 nsprefix = inverted_nsmap[namespace] 

424 break 

425 self.soup.handle_endtag(tag, nsprefix) 

426 if len(self.nsmaps) > 1: 

427 # This tag, or one of its parents, introduced a namespace 

428 # mapping, so pop it off the stack. 

429 out_of_scope_nsmap = self.nsmaps.pop() 

430 

431 if out_of_scope_nsmap is not None: 

432 # This tag introduced a namespace mapping which is no 

433 # longer in scope. Recalculate the currently active 

434 # namespace prefixes. 

435 self.active_namespace_prefixes.pop() 

436 

437 def pi(self, target: str, data: str) -> None: 

438 assert self.soup is not None 

439 self.soup.endData() 

440 data = target + " " + data 

441 self.soup.handle_data(data) 

442 self.soup.endData(self.processing_instruction_class) 

443 

444 def data(self, data: str | bytes) -> None: 

445 assert self.soup is not None 

446 assert isinstance(data, str) 

447 self.soup.handle_data(data) 

448 

449 def doctype(self, name: str, pubid: str, system: str) -> None: 

450 assert self.soup is not None 

451 self.soup.endData() 

452 doctype_string = Doctype._string_for_name_and_ids(name, pubid, system) 

453 self.soup.handle_data(doctype_string) 

454 self.soup.endData(containerClass=Doctype) 

455 

456 def comment(self, text: str | bytes) -> None: 

457 "Handle comments as Comment objects." 

458 assert self.soup is not None 

459 assert isinstance(text, str) 

460 self.soup.endData() 

461 self.soup.handle_data(text) 

462 self.soup.endData(Comment) 

463 

464 def test_fragment_to_document(self, fragment: str) -> str: 

465 """See `TreeBuilder`.""" 

466 return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment 

467 

468 

class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """A TreeBuilder that drives lxml's HTML parser instead of its
    XML parser."""

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """The default parser is always lxml's HTMLParser class,
        regardless of encoding."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Feed the markup to the HTML parser in a single call.

        :raises ParserRejectedMarkup: If lxml cannot make sense of
            the markup under the current encoding.
        """
        # We know self.soup is set by the time feed() is called.
        assert self.soup is not None
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as exc:
            raise ParserRejectedMarkup(exc)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        # Wrap the fragment in a minimal HTML document.
        return "<html><body>%s</body></html>" % (fragment,)