# encoding: utf-8
from __future__ import annotations

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    "LXMLTreeBuilderForXML",
    "LXMLTreeBuilder",
]

from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
    Union,
)

from io import BytesIO
from io import StringIO

from typing_extensions import TypeAlias

from lxml import etree  # type:ignore
from bs4.element import (
    AttributeDict,
    XMLAttributeDict,
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
    XMLProcessingInstruction,
)
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML,
)
from bs4.dammit import EncodingDetector
from bs4.exceptions import ParserRejectedMarkup

if TYPE_CHECKING:
    from bs4._typing import (
        _Encoding,
        _Encodings,
        _NamespacePrefix,
        _NamespaceURL,
        _NamespaceMapping,
        _InvertedNamespaceMapping,
        _RawMarkup,
    )
    from bs4 import BeautifulSoup

LXML: str = "lxml"


def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
    "Invert a dictionary."
    return dict((v, k) for k, v in list(d.items()))
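# Illustrative sketch (not part of the original module): _invert turns a
# prefix -> URL mapping into a URL -> prefix mapping, which is how the
# builder resolves a namespace URL back to its prefix during parsing.
#
#   _invert({"xml": "http://www.w3.org/XML/1998/namespace"})
#   # -> {"http://www.w3.org/XML/1998/namespace": "xml"}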
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]


class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    #: Set this to true (probably by passing huge_tree=True into the
    #: BeautifulSoup constructor) to enable the lxml feature "disable security
    #: restrictions and support very deep trees and very long text
    #: content".
    huge_tree: bool

    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(
        xml="http://www.w3.org/XML/1998/namespace"
    )

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Optional[Set[str]]
    parser: Any
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value
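    # Illustrative sketch (not part of the original module): after parsing
    # markup such as '<root xmlns:dc="http://purl.org/dc/elements/1.1/"/>'
    # with this builder, soup._namespaces would hold the standard "xml"
    # entry registered by initialize_soup plus
    # {"dc": "http://purl.org/dc/elements/1.1/"}, which can later be used
    # for namespaced CSS selectors.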
    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(
            target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding
        )

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments.
            parser = parser(
                target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding
            )
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        huge_tree: bool = False,
        **kwargs: Any,
    ):
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        self.huge_tree = huge_tree

        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == "{" and "}" in tag:
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        return (None, tag)
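    # Illustrative sketch (not part of the original module): lxml reports
    # namespaced names in "Clark notation", {namespace-url}localname, which
    # _getNsTag splits apart:
    #
    #   _getNsTag("{http://www.w3.org/1999/xhtml}body")
    #   # -> ("http://www.w3.org/1999/xhtml", "body")
    #   _getNsTag("body")
    #   # -> (None, "body")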
    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
          in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
          these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
          has undergone character replacement)

          Each 4-tuple represents a strategy for converting the
          document to Unicode and parsing it. Each strategy will be tried
          in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)
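    # Illustrative sketch (not part of the original module): for a bytestring
    # document parsed with user_specified_encoding="utf-8", this generator
    # would yield something like
    #
    #   (b"<root/>", "utf-8", None, False)    # user-specified encoding first
    #   (b"<root/>", "iso-8859-1", None, False)  # then detector guesses
    #
    # and the caller (the BeautifulSoup constructor) tries each strategy in
    # turn, moving on whenever the parser raises ParserRejectedMarkup.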
    def feed(self, markup: _RawMarkup) -> None:
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrib: Dict[str | bytes, str | bytes],
        nsmap: _NamespaceMapping = {},
    ) -> None:
        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time
        # feed() is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrib: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrib.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrib[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold
            # one, and it will be recognized as the default namespace by
            # soupsieve, which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrib[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        final_attrib: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrib.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrib[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrib[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrib,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None
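    # Illustrative sketch (not part of the original module): self.nsmaps is a
    # stack of inverted mappings (URL -> prefix), with None entries marking
    # tags that introduced no new namespaces. Searching it in reverse finds
    # the innermost prefix currently bound to a URL. For example, given
    #
    #   self.nsmaps == [
    #       {"http://www.w3.org/XML/1998/namespace": "xml"},
    #       None,
    #       {"http://example.com/ns": "ex"},
    #   ]
    #
    # _prefix_for_namespace("http://example.com/ns") returns "ex".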
    def end(self, tag: str | bytes) -> None:
        assert self.soup is not None
        assert isinstance(tag, str)
        self.soup.endData()
        namespace, tag = self._getNsTag(tag)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(tag, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        assert self.soup is not None
        self.soup.endData()
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        "Handle comments as Comment objects."
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
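# Illustrative usage sketch (not part of the original module): this builder
# is normally selected through the BeautifulSoup constructor rather than
# instantiated directly.
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(
#       b"<doc xmlns:x='http://example.com/'><x:a/></doc>",
#       "xml",  # "xml" and "lxml-xml" both select LXMLTreeBuilderForXML
#   )
#   # soup._namespaces now maps "x" to "http://example.com/" in addition to
#   # the standard "xml" entry registered by initialize_soup.
#
# Passing huge_tree=True to the constructor relaxes lxml's security limits
# for very deep trees and very long text content, as documented on the
# huge_tree attribute above.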
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        # We know self.soup is set by the time feed() is called.
        assert self.soup is not None
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><body>%s</body></html>" % fragment