Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/_html5lib.py: 4%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

242 statements  

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4__all__ = [ 

5 "HTML5TreeBuilder", 

6] 

7 

8from typing import ( 

9 Any, 

10 cast, 

11 Dict, 

12 Iterable, 

13 Optional, 

14 Sequence, 

15 TYPE_CHECKING, 

16 Tuple, 

17 Union, 

18) 

19from typing_extensions import TypeAlias 

20from bs4._typing import ( 

21 _AttributeValue, 

22 _AttributeValues, 

23 _Encoding, 

24 _Encodings, 

25 _NamespaceURL, 

26 _RawMarkup, 

27) 

28 

29import warnings 

30from bs4.builder import ( 

31 DetectsXMLParsedAsHTML, 

32 PERMISSIVE, 

33 HTML, 

34 HTML_5, 

35 HTMLTreeBuilder, 

36) 

37from bs4.element import ( 

38 NamespacedAttribute, 

39 PageElement, 

40 nonwhitespace_re, 

41) 

42import html5lib 

43from html5lib.constants import ( 

44 namespaces, 

45) 

46from bs4.element import ( 

47 Comment, 

48 Doctype, 

49 NavigableString, 

50 Tag, 

51) 

52 

53if TYPE_CHECKING: 

54 from bs4 import BeautifulSoup 

55 

56from html5lib.treebuilders import base as treebuilder_base 

57 

58 

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
    build a tree.

    Note that `HTML5TreeBuilder` does not support some common HTML
    `TreeBuilder` features. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    Specifically:

    * This `TreeBuilder` doesn't use different subclasses of
      `NavigableString` (e.g. `Script`) based on the name of the tag
      in which the string was found.
    * You can't use a `SoupStrainer` to parse only part of a document.
    """

    NAME: str = "html5lib"

    features: Sequence[str] = [NAME, PERMISSIVE, HTML_5, HTML]

    #: html5lib can tell us which line number and position in the
    #: original file is the source of an element.
    TRACKS_LINE_NUMBERS: bool = True

    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
    user_specified_encoding: Optional[_Encoding]

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Remember the requested encoding and yield the markup unchanged.

        This is a generator, so the warnings below are only issued
        once the caller starts iterating over the result.

        :param markup: The markup to be parsed.
        :param user_specified_encoding: Stored on this object so that
            `feed` can hand it to html5lib later.
        :param document_declared_encoding: Not supported by this tree
            builder; a truthy value only triggers a warning.
        :param exclude_encodings: Not supported by this tree
            builder; a truthy value only triggers a warning.
        :yield: Exactly one tuple, ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        for variable, name in (
            (document_declared_encoding, "document_declared_encoding"),
            (exclude_encodings, "exclude_encodings"),
        ):
            if variable:
                warnings.warn(
                    f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
                    stacklevel=3,
                )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup: _RawMarkup) -> None:
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.

        :param markup: The markup to parse. When it is not a `str`,
            the encoding saved by `prepare_markup` is passed to
            html5lib as ``override_encoding``.
        """
        if self.soup is not None and self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4,
            )

        # self.underlying_builder is probably None now, but it'll be set
        # when html5lib calls self.create_treebuilder().
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        assert self.underlying_builder is not None
        self.underlying_builder.parser = parser
        extra_kwargs: Dict[str, Any] = {}
        if not isinstance(markup, str):
            # kwargs, specifically override_encoding, will eventually
            # be passed in to html5lib's
            # HTMLBinaryInputStream.__init__.
            extra_kwargs["override_encoding"] = self.user_specified_encoding

        try:
            doc = parser.parse(markup, **extra_kwargs)

            # Set the character encoding detected by the tokenizer.
            if isinstance(markup, str):
                # We need to special-case this because html5lib sets
                # charEncoding to UTF-8 if it gets Unicode input.
                doc.original_encoding = None
            else:
                original_encoding = parser.tokenizer.stream.charEncoding[0]
                # The encoding is an html5lib Encoding object. We want to
                # use a string for compatibility with other tree builders.
                original_encoding = original_encoding.name
                doc.original_encoding = original_encoding
        finally:
            # Drop the reference to the parser even if parsing raised,
            # so a failed parse doesn't leave it attached to the
            # underlying builder.
            self.underlying_builder.parser = None

    def create_treebuilder(
        self, namespaceHTMLElements: bool
    ) -> "TreeBuilderForHtml5lib":
        """Called by html5lib to instantiate the kind of class it
        calls a 'TreeBuilder'.

        The new object is also stored in `underlying_builder`.

        :param namespaceHTMLElements: Whether or not to namespace HTML elements.

        :meta private:
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`.

        :param fragment: Markup to place inside the ``<body>`` tag.
        :return: The fragment wrapped in html/head/body tags.
        """
        return f"<html><head></head><body>{fragment}</body></html>"

172 

173 

class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """The html5lib 'TreeBuilder' that builds a Beautiful Soup tree.

    The factory methods below wrap Beautiful Soup objects in the
    `Element` and `TextNode` classes defined in this module.
    """

    soup: "BeautifulSoup"  #: :meta private:
    parser: Optional[html5lib.HTMLParser]  #: :meta private:

    def __init__(
        self,
        namespaceHTMLElements: bool,
        soup: Optional["BeautifulSoup"] = None,
        store_line_numbers: bool = True,
        **kwargs: Any,
    ):
        """Set up the builder around a `BeautifulSoup` object.

        :param namespaceHTMLElements: Passed through to the html5lib
            base class constructor.
        :param soup: The object to populate. Omitting it is
            deprecated; a DeprecationWarning is issued and an empty
            `BeautifulSoup` object is created instead.
        :param store_line_numbers: Whether `elementClass` should
            record line number and position on the tags it creates.
        :param kwargs: Only used when ``soup`` is omitted, in which
            case they are passed to the `BeautifulSoup` constructor.
        """
        # Test against None explicitly: 'soup' is Optional, and whether
        # a value was supplied shouldn't depend on its truthiness.
        if soup is not None:
            self.soup = soup
        else:
            warnings.warn(
                "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
                DeprecationWarning,
                stacklevel=2,
            )
            from bs4 import BeautifulSoup

            # TODO: Why is the parser 'html.parser' here? Using
            # html5lib doesn't cause an infinite loop and is more
            # accurate. Best to get rid of this entire section, I think.
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super().__init__(namespaceHTMLElements)

        # This will be set later to a real html5lib HTMLParser object,
        # which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self) -> "Element":
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token: Dict[str, Any]) -> None:
        """Create a `Doctype` from an html5lib token and add it to the soup.

        :param token: A mapping with "name", "publicId" and "systemId" keys.
        """
        name: str = cast(str, token["name"])
        publicId: Optional[str] = cast(Optional[str], token["publicId"])
        systemId: Optional[str] = cast(Optional[str], token["systemId"])

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name: str, namespace: str) -> "Element":
        """Create a new tag and return it wrapped in an `Element`.

        :param name: The name of the tag.
        :param namespace: The namespace of the tag.
        """
        sourceline: Optional[int] = None
        sourcepos: Optional[int] = None
        if self.parser is not None and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            assert sourcepos is not None
            sourcepos = sourcepos - 1
        tag = self.soup.new_tag(
            name, namespace, sourceline=sourceline, sourcepos=sourcepos
        )

        return Element(tag, self.soup, namespace)

    def commentClass(self, data: str) -> "TextNode":
        """Wrap a new `Comment` containing ``data`` in a `TextNode`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self) -> "Element":
        """This is only used by html5lib HTMLParser.parseFragment(),
        which is never used by Beautiful Soup, only by the html5lib
        unit tests. Since we don't currently hook into those tests,
        the implementation is left blank.
        """
        raise NotImplementedError()

    def getFragment(self) -> "Element":
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()

    def appendChild(self, node: "Element") -> None:
        """Append the Beautiful Soup object wrapped by ``node`` to the soup."""
        # TODO: This code is not covered by the BS4 tests, and
        # apparently not triggered by the html5lib test suite either.
        # But it doesn't seem test-specific and there are calls to it
        # (or a method with the same name) all over html5lib, so I'm
        # leaving the implementation in place rather than replacing it
        # with NotImplementedError()
        self.soup.append(node.element)

    def getDocument(self) -> "BeautifulSoup":
        """Return the `BeautifulSoup` object being populated."""
        return self.soup

    def testSerializer(self, element: "Element") -> str:
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()

275 

276 

class AttrList(object):
    """Represents a Tag's attributes in a way compatible with html5lib.

    Reads are served from ``attrs``, a copy of the tag's attribute
    dictionary taken when this object is created. Writes go to the
    tag itself and do not update that copy.
    """

    element: "Tag"
    attrs: "_AttributeValues"

    def __init__(self, element: "Tag"):
        """Take a snapshot of the attributes of ``element``.

        :param element: The tag whose attributes are represented.
        """
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self) -> Iterable[Tuple[str, "_AttributeValue"]]:
        """Iterate over a copy of the (name, value) pairs in the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name: str, value: "_AttributeValue") -> None:
        """Set an attribute on the underlying tag.

        :param name: The attribute name.
        :param value: The attribute value. A string value for a
            multi-valued attribute is split into a list first.
        """
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if name in list_attr.get("*", []) or name in list_attr.get(
            self.element.name, []
        ):
            # A node that is being cloned may have already undergone
            # this procedure. Check for this and skip it.
            if not isinstance(value, list):
                assert isinstance(value, str)
                value = self.element.attribute_value_list_class(
                    nonwhitespace_re.findall(value)
                )
        self.element[name] = value

    def items(self) -> Iterable[Tuple[str, "_AttributeValue"]]:
        """Return the snapshot's (name, value) pairs as a new list."""
        return list(self.attrs.items())

    def keys(self) -> Iterable[str]:
        """Return the snapshot's attribute names as a new list."""
        return list(self.attrs.keys())

    def __len__(self) -> int:
        """Return the number of attributes in the snapshot."""
        return len(self.attrs)

    def __getitem__(self, name: str) -> "_AttributeValue":
        """Return the snapshot value for ``name``.

        :raise KeyError: If the snapshot has no such attribute.
        """
        return self.attrs[name]

    def __contains__(self, name: str) -> bool:
        """Report whether the snapshot has an attribute called ``name``."""
        # Dictionary membership; no need to build a list of keys first.
        return name in self.attrs

321 

322 

class BeautifulSoupNode(treebuilder_base.Node):
    """Common base for the node wrappers defined in this module.

    `Element` and `TextNode` both derive from this class.
    """

    # The Beautiful Soup object this node wraps.
    element: PageElement
    # The BeautifulSoup object associated with this node.
    soup: "BeautifulSoup"
    # The namespace of the wrapped object, if there is one.
    namespace: Optional[_NamespaceURL]

    @property
    def nodeType(self) -> int:
        """The html5lib constant that corresponds to the type of the
        underlying DOM object.

        NOTE: Only the html5lib test suite reads this property;
        Beautiful Soup proper never does, so it is not implemented.
        """
        raise NotImplementedError

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:
        """Not implemented on the base class; `Element` overrides this."""
        raise NotImplementedError

342 

343 

class Element(BeautifulSoupNode):
    """An html5lib node that wraps a Beautiful Soup `Tag`."""

    element: Tag
    namespace: Optional[_NamespaceURL]

    def __init__(
        self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
    ):
        """Wrap ``element``.

        :param element: The tag to wrap; its name becomes the node name.
        :param soup: The `BeautifulSoup` object used to create strings
            and tags and to register newly parsed objects.
        :param namespace: The namespace of the tag, if any.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node: "BeautifulSoupNode") -> None:
        """Add the object wrapped by ``node`` as the last child of this tag.

        A plain `NavigableString` appended directly after another plain
        `NavigableString` is merged with it into a single new string.
        """
        string_child: Optional[NavigableString] = None
        child: PageElement
        if type(node.element) is NavigableString:
            string_child = child = node.element
        else:
            child = node.element
        node.parent = self

        if (
            child is not None
            and child.parent is not None
            and not isinstance(child, str)
        ):
            node.element.extract()

        if (
            string_child is not None
            and self.element.contents
            and type(self.element.contents[-1]) is NavigableString
        ):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element, most_recent_element=most_recent_element
            )

    def getAttributes(self) -> AttrList:
        """Return an `AttrList` view of the wrapped tag's attributes."""
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    # An HTML5lib attribute name may either be a single string,
    # or a tuple (namespace, name).
    _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
    # Now we can define the type this method accepts as a dictionary
    # mapping those attribute names to single string values.
    _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]

    def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
        """Copy html5lib's attributes onto the wrapped tag.

        :param attributes: Attribute names mapped to string values.
            This dictionary is modified in place: tuple names are
            replaced with `NamespacedAttribute` objects. Nothing
            happens if it is None or empty.
        """
        if attributes is not None and len(attributes) > 0:
            # Replace any namespaced attributes with
            # NamespacedAttribute objects.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            # We can now cast attributes to the type of Dict
            # used by Beautiful Soup.
            normalized_attributes = cast(_AttributeValues, attributes)

            # Values for tags like 'class' came in as single strings;
            # replace them with lists of strings as appropriate.
            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, normalized_attributes
            )

            # Then set the attributes on the Tag associated with this
            # BeautifulSoupNode.
            for name, value_or_values in list(normalized_attributes.items()):
                self.element[name] = value_or_values

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)

    attributes = property(getAttributes, setAttributes)

    def insertText(
        self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
    ) -> None:
        """Add a new string to this tag.

        :param data: The text of the new string.
        :param insertBefore: If given, the string is placed before
            this child; otherwise it is appended.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(
        self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
    ) -> None:
        """Insert ``node`` immediately before the child ``refNode``.

        A plain `NavigableString` inserted directly after another plain
        `NavigableString` is merged with it into a single new string.
        """
        index = self.element.index(refNode.element)
        if (
            type(node.element) is NavigableString
            # There is no previous sibling when refNode is the first
            # child. Without this check, index - 1 would be -1 and
            # select the *last* child, merging the text into the wrong
            # string.
            and index > 0
            and type(self.element.contents[index - 1]) is NavigableString
        ):
            # (See comments in appendChild)
            old_node = self.element.contents[index - 1]
            assert type(old_node) is NavigableString
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node: "Element") -> None:
        """Extract the object wrapped by ``node`` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent: "Element") -> None:
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.

            # We can make this assertion since we know new_parent has
            # children.
            assert new_parents_last_descendant is not None
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = (
                new_parents_last_descendant.next_element
            )
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(
                is_initialized=False, accept_self=True
            )

            # Since we passed accept_self=True into _last_descendant,
            # there's no possibility that the result is None.
            assert last_childs_last_descendant is not None
            last_childs_last_descendant.next_element = (
                new_parents_last_descendant_next_element
            )
            if new_parents_last_descendant_next_element is not None:
                # TODO-COVERAGE: This code has no test coverage and
                # I'm not sure how to get html5lib to go through this
                # path, but it's just the other side of the previous
                # line.
                new_parents_last_descendant_next_element.previous_element = (
                    last_childs_last_descendant
                )
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # hasContent returns a boolean, not None.
    def hasContent(self) -> bool:
        """Report whether the wrapped tag has any children."""
        return len(self.element.contents) > 0

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:
        """Return a new `Element` with the same tag name, namespace
        and attributes. Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
        """Return ``(namespace, name)``, substituting html5lib's HTML
        namespace when this node has no namespace of its own.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

586 

587 

class TextNode(BeautifulSoupNode):
    """An html5lib node that wraps a Beautiful Soup `NavigableString`."""

    element: NavigableString

    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
        """Wrap ``element``; the html5lib node name is set to None.

        :param element: The string object to wrap.
        :param soup: The `BeautifulSoup` object associated with this node.
        """
        # BeautifulSoupNode defines no __init__ of its own, so this
        # resolves to the html5lib Node constructor.
        super().__init__(None)
        self.soup = soup
        self.element = element

594 self.soup = soup