1# encoding: utf-8
2from __future__ import annotations
3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
7__all__ = [
8 "LXMLTreeBuilderForXML",
9 "LXMLTreeBuilder",
10]
11
12
13from typing import (
14 Any,
15 Dict,
16 Iterable,
17 List,
18 Optional,
19 Set,
20 Tuple,
21 Type,
22 TYPE_CHECKING,
23 Union,
24)
25
26from io import BytesIO
27from io import StringIO
28
29from typing_extensions import TypeAlias
30
31from lxml import etree # type:ignore
32from bs4.element import (
33 AttributeDict,
34 XMLAttributeDict,
35 Comment,
36 Doctype,
37 NamespacedAttribute,
38 ProcessingInstruction,
39 XMLProcessingInstruction,
40)
41from bs4.builder import (
42 DetectsXMLParsedAsHTML,
43 FAST,
44 HTML,
45 HTMLTreeBuilder,
46 PERMISSIVE,
47 TreeBuilder,
48 XML,
49)
50from bs4.dammit import EncodingDetector
51from bs4.exceptions import ParserRejectedMarkup
52
53if TYPE_CHECKING:
54 from bs4._typing import (
55 _Encoding,
56 _Encodings,
57 _NamespacePrefix,
58 _NamespaceURL,
59 _NamespaceMapping,
60 _InvertedNamespaceMapping,
61 _RawMarkup,
62 )
63 from bs4 import BeautifulSoup
64
# Feature name shared by both builders in this module; passing
# features="lxml" to BeautifulSoup selects one of them.
LXML: str = "lxml"
66
67
68def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
69 "Invert a dictionary."
70 return dict((v, k) for k, v in list(d.items()))
71
72
# Type aliases for what default_parser() may hand back: either an
# already-instantiated lxml parser, or a parser class that
# parser_for() will instantiate on demand.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]
77
78
class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that uses lxml's ``etree.XMLParser`` in event-driven
    (target) mode to parse documents as XML.
    """

    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    # Assigned in __init__ based on is_xml, so the HTML subclass gets
    # plain ProcessingInstruction objects instead.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to the underlying lxml parser in chunks of this
    # many bytes/characters (see feed()).
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    # A stack of inverted (URL -> prefix) namespace mappings, pushed in
    # start() and popped in end(); a None entry marks a tag that
    # introduced no new namespaces.
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Optional[Set[str]]
    # The lxml parser object currently in use; created in feed().
    parser: Any
    # A parser (or parser factory) supplied by the caller of __init__,
    # or None to use DEFAULT_PARSER_CLASS.
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :param encoding: The encoding lxml should assume for the document,
            or None to let lxml figure it out.
        :return: Either a parser object or a class, which
            will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, recover=True, encoding=encoding)
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        **kwargs: Any,
    ):
        """Constructor.

        :param parser: An lxml parser (or parser class) to use instead
            of DEFAULT_PARSER_CLASS.
        :param empty_element_tags: Tags to treat as empty-element tags.
        :param kwargs: Passed through to the TreeBuilder constructor.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        """Split the namespace URL out of a fully-qualified lxml tag
        name, e.g. '{http://ns}name' -> ('http://ns', 'name').

        Adapted from lxml's src/lxml/sax.py.
        """
        # startswith() is safe on an empty tag name, unlike tag[0],
        # which would raise IndexError.
        if tag.startswith("{") and "}" in tag:
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        return (None, tag)

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup: _RawMarkup) -> None:
        """Run incoming markup through the lxml parser, chunk by chunk.

        :param markup: A string or bytestring containing the document.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        :raises TypeError: If markup is neither str nor bytes.
        """
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)
        else:
            # Previously this fell through and raised a confusing
            # UnboundLocalError on the first use of `io`; fail with a
            # clear message instead.
            raise TypeError(
                "Expected markup as str or bytes, got %s." % type(markup).__name__
            )

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        """lxml event handler: the document is finished.

        Reset the namespace stack to its initial state.
        """
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrib: Dict[str | bytes, str | bytes],
        nsmap: Optional[_NamespaceMapping] = None,
    ) -> None:
        """lxml event handler: an opening tag was encountered.

        :param tag: The (possibly namespace-qualified) tag name.
        :param attrib: The tag's attributes.
        :param nsmap: Namespace prefixes newly introduced by this tag.
        """
        # Avoid a shared mutable default argument; lxml may call this
        # method without an nsmap. (The dict is never mutated here, so
        # this is purely defensive and behavior-compatible.)
        if nsmap is None:
            nsmap = {}

        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
        # is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrib: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrib.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrib[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrib[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        final_attrib: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrib.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrib[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrib[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrib,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search from the innermost (most recently pushed) scope out.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, tag: str | bytes) -> None:
        """lxml event handler: a closing tag was encountered.

        :param tag: The (possibly namespace-qualified) tag name.
        """
        assert self.soup is not None
        assert isinstance(tag, str)
        # Flush any pending text into a NavigableString before closing
        # the tag.
        self.soup.endData()
        namespace, tag = self._getNsTag(tag)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(tag, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        """lxml event handler: a processing instruction was encountered."""
        assert self.soup is not None
        self.soup.endData()
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        """lxml event handler: textual data was encountered."""
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        """lxml event handler: a DOCTYPE declaration was encountered."""
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        "Handle comments as Comment objects."
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
467
468
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """A TreeBuilder that uses lxml's HTMLParser to parse documents
    as HTML rather than XML.
    """

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = [*ALTERNATE_NAMES, NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Always use lxml's HTMLParser class; parser_for() will
        instantiate it with the appropriate encoding."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Run the markup through a freshly instantiated HTML parser.

        :param markup: A string or bytestring containing the document.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        # feed() is only called after initialize_soup(), so self.soup
        # must be set by now.
        assert self.soup is not None
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return f"<html><body>{fragment}</body></html>"