Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/_lxml.py: 28%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

205 statements  

1# encoding: utf-8 

2from __future__ import annotations 

3 

4# Use of this source code is governed by the MIT license. 

5__license__ = "MIT" 

6 

7__all__ = [ 

8 "LXMLTreeBuilderForXML", 

9 "LXMLTreeBuilder", 

10] 

11 

12 

13from typing import ( 

14 Any, 

15 Dict, 

16 Iterable, 

17 List, 

18 Optional, 

19 Set, 

20 Tuple, 

21 Type, 

22 TYPE_CHECKING, 

23 Union, 

24) 

25from typing_extensions import TypeAlias 

26 

27from io import BytesIO 

28from io import StringIO 

29from lxml import etree 

30from bs4.element import ( 

31 AttributeDict, 

32 XMLAttributeDict, 

33 Comment, 

34 Doctype, 

35 NamespacedAttribute, 

36 ProcessingInstruction, 

37 XMLProcessingInstruction, 

38) 

39from bs4.builder import ( 

40 DetectsXMLParsedAsHTML, 

41 FAST, 

42 HTML, 

43 HTMLTreeBuilder, 

44 PERMISSIVE, 

45 TreeBuilder, 

46 XML, 

47) 

48from bs4.dammit import EncodingDetector 

49from bs4.exceptions import ParserRejectedMarkup 

50 

51if TYPE_CHECKING: 

52 from bs4._typing import ( 

53 _Encoding, 

54 _Encodings, 

55 _NamespacePrefix, 

56 _NamespaceURL, 

57 _NamespaceMapping, 

58 _InvertedNamespaceMapping, 

59 _RawMarkup, 

60 ) 

61 from bs4 import BeautifulSoup 

62 

63LXML: str = "lxml" 

64 

65 

66def _invert(d: dict[Any, Any]) -> dict[Any, Any]: 

67 "Invert a dictionary." 

68 return dict((v, k) for k, v in list(d.items())) 

69 

70 

# Type aliases for the lxml parser objects this module works with.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
# Either an instantiated parser, or a parser class that will be
# instantiated with default arguments (see parser_for()).
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]

75 

76 

class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that uses lxml's XML parser as the parse engine,
    acting as the `target` of lxml parse events and forwarding them to
    a BeautifulSoup object.
    """

    # Parser class used when no parser object was passed to __init__.
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    # Set in __init__ based on is_xml; used by pi() to wrap
    # processing-instruction content.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to lxml in pieces of this size (see feed()).
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    # Stack of inverted (URL -> prefix) namespace mappings; a None
    # entry marks a tag that introduced no new namespaces (see start()
    # and end()).
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Set[str]
    parser: Any
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

109 

110 def initialize_soup(self, soup: BeautifulSoup) -> None: 

111 """Let the BeautifulSoup object know about the standard namespace 

112 mapping. 

113 

114 :param soup: A `BeautifulSoup`. 

115 """ 

116 # Beyond this point, self.soup is set, so we can assume (and 

117 # assert) it's not None whenever necessary. 

118 super(LXMLTreeBuilderForXML, self).initialize_soup(soup) 

119 self._register_namespaces(self.DEFAULT_NSMAPS) 

120 

121 def _register_namespaces(self, mapping: Dict[str, str]) -> None: 

122 """Let the BeautifulSoup object know about namespaces encountered 

123 while parsing the document. 

124 

125 This might be useful later on when creating CSS selectors. 

126 

127 This will track (almost) all namespaces, even ones that were 

128 only in scope for part of the document. If two namespaces have 

129 the same prefix, only the first one encountered will be 

130 tracked. Un-prefixed namespaces are not tracked. 

131 

132 :param mapping: A dictionary mapping namespace prefixes to URIs. 

133 """ 

134 assert self.soup is not None 

135 for key, value in list(mapping.items()): 

136 # This is 'if key' and not 'if key is not None' because we 

137 # don't track un-prefixed namespaces. Soupselect will 

138 # treat an un-prefixed namespace as the default, which 

139 # causes confusion in some cases. 

140 if key and key not in self.soup._namespaces: 

141 # Let the BeautifulSoup object know about a new namespace. 

142 # If there are multiple namespaces defined with the same 

143 # prefix, the first one in the document takes precedence. 

144 self.soup._namespaces[key] = value 

145 

146 def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass: 

147 """Find the default parser for the given encoding. 

148 

149 :return: Either a parser object or a class, which 

150 will be instantiated with default arguments. 

151 """ 

152 if self._default_parser is not None: 

153 return self._default_parser 

154 return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding) 

155 

156 def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser: 

157 """Instantiate an appropriate parser for the given encoding. 

158 

159 :param encoding: A string. 

160 :return: A parser object such as an `etree.XMLParser`. 

161 """ 

162 # Use the default parser. 

163 parser = self.default_parser(encoding) 

164 

165 if callable(parser): 

166 # Instantiate the parser with default arguments 

167 parser = parser(target=self, recover=True, encoding=encoding) 

168 return parser 

169 

170 def __init__( 

171 self, 

172 parser: Optional[etree.XMLParser] = None, 

173 empty_element_tags: Optional[Set[str]] = None, 

174 **kwargs: Any, 

175 ): 

176 # TODO: Issue a warning if parser is present but not a 

177 # callable, since that means there's no way to create new 

178 # parsers for different encodings. 

179 self._default_parser = parser 

180 self.soup = None 

181 self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] 

182 self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)] 

183 if self.is_xml: 

184 self.processing_instruction_class = XMLProcessingInstruction 

185 else: 

186 self.processing_instruction_class = ProcessingInstruction 

187 

188 if "attribute_dict_class" not in kwargs: 

189 kwargs["attribute_dict_class"] = XMLAttributeDict 

190 super(LXMLTreeBuilderForXML, self).__init__(**kwargs) 

191 

192 def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]: 

193 # Split the namespace URL out of a fully-qualified lxml tag 

194 # name. Copied from lxml's src/lxml/sax.py. 

195 if tag[0] == "{": 

196 namespace, name = tag[1:].split("}", 1) 

197 return (namespace, name) 

198 else: 

199 return (None, tag) 

200 

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

        Each 4-tuple represents a strategy for converting the
        document to Unicode and parsing it. Each strategy will be tried
        in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        # From here on, markup is a bytestring; try the encodings in
        # descending order of reliability.
        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        # Yield one parsing strategy per candidate encoding.
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

283 

    def feed(self, markup: _RawMarkup) -> None:
        """Feed markup to the lxml parser, CHUNK_SIZE pieces at a time.

        :param markup: A string or bytestring of markup.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        # NOTE(review): if markup is neither bytes nor str, `io` stays
        # unbound and io.read() raises UnboundLocalError -- assumes
        # _RawMarkup is always str | bytes.
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Surface lxml's rejection in Beautiful Soup's own terms so
            # the caller can try a different tree builder.
            raise ParserRejectedMarkup(e)

309 

    def close(self) -> None:
        """Reset the namespace stack to just the default XML namespace
        mapping, discarding state from the finished parse."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

312 

    def start(
        self,
        tag: str | bytes,
        attrs: Dict[str | bytes, str | bytes],
        nsmap: _NamespaceMapping = {},  # NOTE(review): mutable default; safe here because nsmap is only read, never mutated.
    ) -> None:
        """lxml target-event handler for an opening tag.

        Tracks the namespace scope introduced (or not) by this tag,
        converts namespaced tag/attribute names to NamespacedAttribute
        objects, and forwards the tag to the BeautifulSoup object.
        """
        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
        # is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrs: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrs.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrs[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        final_attrs: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrs[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrs,
            namespaces=self.active_namespace_prefixes[-1],
        )

401 

402 def _prefix_for_namespace( 

403 self, namespace: Optional[_NamespaceURL] 

404 ) -> Optional[_NamespacePrefix]: 

405 """Find the currently active prefix for the given namespace.""" 

406 if namespace is None: 

407 return None 

408 for inverted_nsmap in reversed(self.nsmaps): 

409 if inverted_nsmap is not None and namespace in inverted_nsmap: 

410 return inverted_nsmap[namespace] 

411 return None 

412 

    def end(self, name: str | bytes) -> None:
        """lxml target-event handler for a closing tag.

        Resolves the tag's namespace prefix, forwards the end-tag to
        the BeautifulSoup object, and unwinds the namespace stack entry
        pushed for this tag by start().
        """
        assert self.soup is not None
        assert isinstance(name, str)
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            # Innermost scope wins, so search the stack in reverse.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

435 

436 def pi(self, target: str, data: str) -> None: 

437 assert self.soup is not None 

438 self.soup.endData() 

439 data = target + " " + data 

440 self.soup.handle_data(data) 

441 self.soup.endData(self.processing_instruction_class) 

442 

443 def data(self, data: str | bytes) -> None: 

444 assert self.soup is not None 

445 assert isinstance(data, str) 

446 self.soup.handle_data(data) 

447 

448 def doctype(self, name: str, pubid: str, system: str) -> None: 

449 assert self.soup is not None 

450 self.soup.endData() 

451 doctype_string = Doctype._string_for_name_and_ids(name, pubid, system) 

452 self.soup.handle_data(doctype_string) 

453 self.soup.endData(containerClass=Doctype) 

454 

455 def comment(self, text: str | bytes) -> None: 

456 "Handle comments as Comment objects." 

457 assert self.soup is not None 

458 assert isinstance(text, str) 

459 self.soup.endData() 

460 self.soup.handle_data(text) 

461 self.soup.endData(Comment) 

462 

463 def test_fragment_to_document(self, fragment: str) -> str: 

464 """See `TreeBuilder`.""" 

465 return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment 

466 

467 

class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """A TreeBuilder that uses lxml's HTML parser as the parse engine."""

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Always use lxml's HTML parser class; parser_for() will
        instantiate it with default arguments."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Run the markup through lxml's HTML parser in one shot.

        :param markup: A string or bytestring of markup.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        # We know self.soup is set by the time feed() is called.
        assert self.soup is not None
        encoding = self.soup.original_encoding
        try:
            parser = self.parser_for(encoding)
            self.parser = parser
            parser.feed(markup)
            parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as cause:
            # Surface lxml's rejection in Beautiful Soup's own terms.
            raise ParserRejectedMarkup(cause)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        # Wrap the fragment so it parses as a complete HTML document.
        return "<html><body>%s</body></html>" % fragment