1# encoding: utf-8
2from __future__ import annotations
3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
7__all__ = [
8 "LXMLTreeBuilderForXML",
9 "LXMLTreeBuilder",
10]
11
12
13from typing import (
14 Any,
15 Dict,
16 Iterable,
17 List,
18 Optional,
19 Set,
20 Tuple,
21 Type,
22 TYPE_CHECKING,
23 Union,
24)
25from typing_extensions import TypeAlias
26
27from io import BytesIO
28from io import StringIO
29from lxml import etree
30from bs4.element import (
31 AttributeDict,
32 XMLAttributeDict,
33 Comment,
34 Doctype,
35 NamespacedAttribute,
36 ProcessingInstruction,
37 XMLProcessingInstruction,
38)
39from bs4.builder import (
40 DetectsXMLParsedAsHTML,
41 FAST,
42 HTML,
43 HTMLTreeBuilder,
44 PERMISSIVE,
45 TreeBuilder,
46 XML,
47)
48from bs4.dammit import EncodingDetector
49from bs4.exceptions import ParserRejectedMarkup
50
51if TYPE_CHECKING:
52 from bs4._typing import (
53 _Encoding,
54 _Encodings,
55 _NamespacePrefix,
56 _NamespaceURL,
57 _NamespaceMapping,
58 _InvertedNamespaceMapping,
59 _RawMarkup,
60 )
61 from bs4 import BeautifulSoup
62
# Feature name shared by both builders defined in this module.
LXML: str = "lxml"
64
65
66def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
67 "Invert a dictionary."
68 return dict((v, k) for k, v in list(d.items()))
69
70
# An instantiated lxml parser of either flavor.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
# What default_parser()/parser_for() may return: either a ready parser
# object, or a parser class to be instantiated with default arguments.
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]
75
76
class LXMLTreeBuilderForXML(TreeBuilder):
    """A `TreeBuilder` that uses lxml's XML parser.

    An instance of this class is passed to ``etree.XMLParser`` as its
    ``target``: lxml then invokes the event-handler methods defined
    below (``start``, ``end``, ``pi``, ``data``, ``doctype``,
    ``comment``, ``close``) as it parses, and each handler forwards
    the event to the associated `BeautifulSoup` object.
    """

    # Parser class instantiated by default_parser() when no parser (or
    # parser factory) was supplied to the constructor.
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    # Set in __init__: XMLProcessingInstruction when is_xml is True,
    # plain ProcessingInstruction otherwise.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    # feed() pushes markup into lxml in chunks of this many
    # characters/bytes.
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    # A stack of inverted (URL -> prefix) namespace mappings. Once
    # non-default namespaces are in play, start() pushes one entry per
    # open tag (None when the tag introduces no new namespaces) and
    # end() pops it; see those methods.
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Set[str]
    # The underlying lxml parser object; (re)created in feed().
    parser: Any
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :param encoding: The encoding the parser should assume, or
            None to let lxml detect it.
        :return: Either a parser object or a class, which
            will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, recover=True, encoding=encoding)
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        **kwargs: Any,
    ):
        """Constructor.

        :param parser: An lxml parser object -- or a callable that
            creates one -- to use instead of the default.
        :param empty_element_tags: NOTE(review): accepted here but
            never referenced in this body and not passed to super();
            confirm whether callers rely on it.
        :param kwargs: Passed through to `TreeBuilder`.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        # XML documents get XML-style attribute handling unless the
        # caller asked for something else.
        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == "{":
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        else:
            return (None, tag)

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

        Each 4-tuple represents a strategy for converting the
        document to Unicode and parsing it. Each strategy will be tried
        in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup: _RawMarkup) -> None:
        """Feed the markup to a freshly created lxml parser, in
        CHUNK_SIZE pieces.

        :param markup: A string or bytestring.
        :raises ParserRejectedMarkup: If lxml cannot make sense of the
            markup, or decoding/encoding-lookup fails.
        """
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)
        # NOTE(review): if markup is neither bytes nor str, `io` is
        # never bound and io.read() below raises NameError; callers
        # presumably guarantee one of the two types -- confirm.

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        """Reset the namespace stack to the default mapping."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrs: Dict[str | bytes, str | bytes],
        nsmap: _NamespaceMapping = {},
    ) -> None:
        """Called by lxml for each opening tag.

        :param tag: The tag name, possibly in lxml's
            "{namespace-url}localname" notation.
        :param attrs: The tag's attributes.
        :param nsmap: Namespace prefix->URL mappings newly declared on
            this tag. (The shared ``{}`` default is only ever read
            here, never mutated, so it is safe.)
        """
        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
        # is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrs: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrs.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrs[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        final_attrs: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrs[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrs,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search innermost scope first.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name: str | bytes) -> None:
        """Called by lxml for each closing tag.

        :param name: The tag name, possibly in lxml's
            "{namespace-url}localname" notation.
        """
        assert self.soup is not None
        assert isinstance(name, str)
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            # Find the prefix for this namespace, innermost scope first.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        """Called by lxml for each processing instruction.

        The PI's target and data are stored as a single string,
        "target data", wrapped in processing_instruction_class.
        """
        assert self.soup is not None
        self.soup.endData()
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        """Called by lxml when textual data is encountered."""
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        """Called by lxml when a doctype declaration is encountered;
        stores it as a Doctype object."""
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        "Handle comments as Comment objects."
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
466
467
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The lxml-based HTML builder: identical to the XML builder above
    except that it drives lxml's HTMLParser and feeds the markup in a
    single call rather than in chunks."""

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    is_xml: bool = False
    features: Iterable[str] = [*ALTERNATE_NAMES, NAME, HTML, FAST, PERMISSIVE]

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Always hand back lxml's HTML parser class; parser_for() will
        instantiate it with the appropriate arguments."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Parse the whole document in one shot.

        :raises ParserRejectedMarkup: If lxml cannot make sense of the
            markup, or decoding/encoding-lookup fails.
        """
        # We know self.soup is set by the time feed() is called.
        assert self.soup is not None
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as cause:
            raise ParserRejectedMarkup(cause)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><body>%s</body></html>" % fragment