Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/_lxml.py: 28%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

207 statements  

1# encoding: utf-8 

2from __future__ import annotations 

3 

4# Use of this source code is governed by the MIT license. 

5__license__ = "MIT" 

6 

7__all__ = [ 

8 "LXMLTreeBuilderForXML", 

9 "LXMLTreeBuilder", 

10] 

11 

12 

13from typing import ( 

14 Any, 

15 Dict, 

16 Iterable, 

17 List, 

18 Optional, 

19 Set, 

20 Tuple, 

21 Type, 

22 TYPE_CHECKING, 

23 Union, 

24) 

25 

26from io import BytesIO 

27from io import StringIO 

28 

29from typing_extensions import TypeAlias 

30 

31from lxml import etree # type:ignore 

32from bs4.element import ( 

33 AttributeDict, 

34 XMLAttributeDict, 

35 Comment, 

36 Doctype, 

37 NamespacedAttribute, 

38 ProcessingInstruction, 

39 XMLProcessingInstruction, 

40) 

41from bs4.builder import ( 

42 DetectsXMLParsedAsHTML, 

43 FAST, 

44 HTML, 

45 HTMLTreeBuilder, 

46 PERMISSIVE, 

47 TreeBuilder, 

48 XML, 

49) 

50from bs4.dammit import EncodingDetector 

51from bs4.exceptions import ParserRejectedMarkup 

52 

53if TYPE_CHECKING: 

54 from bs4._typing import ( 

55 _Encoding, 

56 _Encodings, 

57 _NamespacePrefix, 

58 _NamespaceURL, 

59 _NamespaceMapping, 

60 _InvertedNamespaceMapping, 

61 _RawMarkup, 

62 ) 

63 from bs4 import BeautifulSoup 

64 

#: Feature string identifying builders backed by the lxml library.
LXML: str = "lxml"

66 

67 

68def _invert(d: dict[Any, Any]) -> dict[Any, Any]: 

69 "Invert a dictionary." 

70 return dict((v, k) for k, v in list(d.items())) 

71 

72 

#: Either flavor of instantiated lxml parser.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
#: A parser instance, or a parser class that will be instantiated
#: with default arguments (see parser_for()).
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]

77 

78 

class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that drives lxml's event-based XML parser.

    An instance of this class serves as the *target* of an
    ``etree.XMLParser``: lxml invokes the callback methods defined here
    (``start``, ``end``, ``pi``, ``data``, ``doctype``, ``comment``,
    ``close``) as it parses, and each callback forwards the event to
    the associated `BeautifulSoup` object.
    """

    #: Parser class instantiated when the user didn't supply a parser.
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    #: Set this to true (probably by passing huge_tree=True into the
    #: BeautifulSoup constructor) to enable the lxml feature "disable security
    #: restrictions and support very deep trees and very long text
    #: content".
    huge_tree: bool

    #: Class used to represent processing instructions; chosen in
    #: __init__ depending on whether this builder is in XML mode.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    #: Markup is fed to lxml in chunks of this many bytes/characters.
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    #: Stack of inverted (URI -> prefix) namespace mappings, one entry
    #: pushed per open tag; a None entry marks a tag that introduced no
    #: new namespaces. Popped in end().
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Optional[Set[str]]
    parser: Any
    _default_parser: Optional[etree.XMLParser]
    #: Stack of (prefix -> URI) mappings currently in scope; the top of
    #: the stack is attached to every Tag created while it's active.
    active_namespace_prefixes: List[_NamespaceMapping]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding)

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding)
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        huge_tree: bool = False,
        **kwargs: Any,
    ):
        """Constructor.

        :param parser: An lxml parser object (or a callable that builds
            one); if omitted, one is created per encoding by parser_for().
        :param empty_element_tags: A set of tags to be treated as
            empty-element tags.
            NOTE(review): this parameter is accepted but not referenced
            in this constructor body — confirm whether a superclass
            consumes it via **kwargs or it is simply unused.
        :param huge_tree: Passed through to lxml's parser; see the
            huge_tree class attribute.
        :param kwargs: Passed along to the TreeBuilder constructor.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        # Default to XML-style attribute dicts unless the caller asked
        # for something else.
        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        self.huge_tree = huge_tree

        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        """Split a fully-qualified lxml tag name like ``{uri}local``
        into a ``(namespace, name)`` tuple; namespace is None when the
        tag is unqualified.
        """
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == "{" and "}" in tag:
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        return (None, tag)

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

        Each 4-tuple represents a strategy for converting the
        document to Unicode and parsing it. Each strategy will be tried
        in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup: _RawMarkup) -> None:
        """Feed the markup to the lxml parser, CHUNK_SIZE units at a time.

        :param markup: A bytestring or string.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)
        # NOTE(review): if markup is neither bytes nor str, `io` is
        # left unbound and the read below raises UnboundLocalError --
        # confirm callers never pass anything else.

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        """Reset the namespace stack once parsing is finished."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrib: Dict[str | bytes, str | bytes],
        nsmap: _NamespaceMapping = {},
    ) -> None:
        """lxml callback: an opening tag was encountered.

        :param tag: The (possibly namespace-qualified) tag name.
        :param attrib: The tag's attributes, as provided by lxml.
        :param nsmap: Namespace prefix-to-URI mappings newly introduced
            by this tag. (The shared default dict is safe here: nsmap
            is only ever read, never mutated.)
        """
        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
        # is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrib: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrib.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrib[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrib[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        final_attrib: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrib.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrib[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrib[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrib,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search innermost scope first.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, tag: str | bytes) -> None:
        """lxml callback: a closing tag was encountered.

        Pops the namespace-stack entry pushed by the matching start().
        """
        assert self.soup is not None
        assert isinstance(tag, str)
        self.soup.endData()
        namespace, tag = self._getNsTag(tag)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(tag, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        """lxml callback: a processing instruction was encountered."""
        assert self.soup is not None
        self.soup.endData()
        # Store the PI as "<target> <data>" inside a single node.
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        """lxml callback: a run of text content was encountered."""
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        """lxml callback: a DOCTYPE declaration was encountered."""
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        "Handle comments as Comment objects."
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment

476 

477 

class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The lxml-based HTML tree builder.

    Reuses the XML builder's callback machinery but targets lxml's
    HTML parser and turns off XML mode.
    """

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = [*ALTERNATE_NAMES, NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Always use lxml's HTML parser class; parser_for() takes care
        of instantiating it with default arguments."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Run some incoming markup through the lxml HTML parser.

        :param markup: A string or bytestring.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        # By the time feed() runs, initialize_soup() has set self.soup.
        assert self.soup is not None
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><body>%s</body></html>" % fragment