Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/_html5lib.py: 4%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

254 statements  

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4__all__ = [ 

5 "HTML5TreeBuilder", 

6] 

7 

8from typing import ( 

9 Any, 

10 cast, 

11 Dict, 

12 Iterable, 

13 Optional, 

14 Sequence, 

15 TYPE_CHECKING, 

16 Tuple, 

17 Union, 

18) 

19from typing_extensions import TypeAlias 

20from bs4._typing import ( 

21 _AttributeValue, 

22 _AttributeValues, 

23 _Encoding, 

24 _Encodings, 

25 _NamespaceURL, 

26 _RawMarkup, 

27) 

28 

29import warnings 

30from bs4.builder import ( 

31 DetectsXMLParsedAsHTML, 

32 PERMISSIVE, 

33 HTML, 

34 HTML_5, 

35 HTMLTreeBuilder, 

36) 

37from bs4.element import ( 

38 NamespacedAttribute, 

39 PageElement, 

40 nonwhitespace_re, 

41) 

42import html5lib 

43from html5lib.constants import ( 

44 namespaces, 

45) 

46from bs4.element import ( 

47 Comment, 

48 Doctype, 

49 NavigableString, 

50 Tag, 

51) 

52 

53if TYPE_CHECKING: 

54 from bs4 import BeautifulSoup 

55 

56from html5lib.treebuilders import base as treebuilder_base 

57 

58 

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
    build a tree.

    Note that `HTML5TreeBuilder` does not support some common HTML
    `TreeBuilder` features. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    Specifically:

    * This `TreeBuilder` doesn't use different subclasses of
      `NavigableString` (e.g. `Script`) based on the name of the tag
      in which the string was found.
    * You can't use a `SoupStrainer` to parse only part of a document.
    """

    NAME: str = "html5lib"

    features: Iterable[str] = [NAME, PERMISSIVE, HTML_5, HTML]

    #: html5lib can tell us which line number and position in the
    #: original file is the source of an element.
    TRACKS_LINE_NUMBERS: bool = True

    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
    user_specified_encoding: Optional[_Encoding]

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Remember the requested encoding and yield the markup unchanged.

        :param markup: The markup that is about to be parsed.
        :param user_specified_encoding: Stored on this object so that
            `feed` can hand it to html5lib.
        :param document_declared_encoding: Not supported; a warning is
            issued if a truthy value is passed.
        :param exclude_encodings: Not supported; a warning is
            issued if a truthy value is passed.
        :return: A generator yielding exactly one tuple,
            ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        for variable, name in (
            (document_declared_encoding, "document_declared_encoding"),
            (exclude_encodings, "exclude_encodings"),
        ):
            if variable:
                warnings.warn(
                    f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
                    stacklevel=3,
                )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup: _RawMarkup) -> None:
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.

        :param markup: The markup to parse. When it is not a `str`,
            the stored user-specified encoding is passed to html5lib
            as ``override_encoding``.
        """
        if self.soup is not None and self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4,
            )

        # self.underlying_builder is probably None now, but it'll be set
        # when html5lib calls self.create_treebuilder().
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        assert self.underlying_builder is not None
        self.underlying_builder.parser = parser
        try:
            extra_kwargs = {}
            if not isinstance(markup, str):
                # kwargs, specifically override_encoding, will eventually
                # be passed in to html5lib's
                # HTMLBinaryInputStream.__init__.
                extra_kwargs["override_encoding"] = self.user_specified_encoding

            doc = parser.parse(markup, **extra_kwargs)  # type:ignore

            # Set the character encoding detected by the tokenizer.
            if isinstance(markup, str):
                # We need to special-case this because html5lib sets
                # charEncoding to UTF-8 if it gets Unicode input.
                doc.original_encoding = None
            else:
                original_encoding = parser.tokenizer.stream.charEncoding[0]  # type:ignore
                # The encoding is an html5lib Encoding object. We want to
                # use a string for compatibility with other tree builders.
                original_encoding = original_encoding.name
                doc.original_encoding = original_encoding
        finally:
            # Drop the builder's reference to the parser even when
            # parsing raises, so a failed parse doesn't leave the
            # builder pointing at a finished parser.
            self.underlying_builder.parser = None

    def create_treebuilder(
        self, namespaceHTMLElements: bool
    ) -> "TreeBuilderForHtml5lib":
        """Called by html5lib to instantiate the kind of class it
        calls a 'TreeBuilder'.

        The new object is also stored in `underlying_builder`.

        :param namespaceHTMLElements: Whether or not to namespace HTML elements.

        :meta private:
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><head></head><body>%s</body></html>" % fragment

172 

173 

class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """The html5lib-style tree builder that records what it is told
    to build in a `BeautifulSoup` object.
    """

    soup: "BeautifulSoup"  #: :meta private:
    parser: Optional[html5lib.HTMLParser]  #: :meta private:

    def __init__(
        self,
        namespaceHTMLElements: bool,
        soup: Optional["BeautifulSoup"] = None,
        store_line_numbers: bool = True,
        **kwargs: Any,
    ):
        """Set up the builder around a `BeautifulSoup` object.

        :param namespaceHTMLElements: Passed through to the html5lib
            base class constructor.
        :param soup: The object that will receive the parsed tree.
            Omitting it is deprecated; a warning is issued and an
            empty `BeautifulSoup` object is created instead.
        :param store_line_numbers: Whether `elementClass` should look
            up the parser's current position for each new tag.
        :param kwargs: Only used when ``soup`` is missing, in which
            case they go to the `BeautifulSoup` constructor.
        """
        if not soup:
            warnings.warn(
                "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
                DeprecationWarning,
                stacklevel=2,
            )
            from bs4 import BeautifulSoup

            # TODO: Why is the parser 'html.parser' here? Using
            # html5lib doesn't cause an infinite loop and is more
            # accurate. Best to get rid of this entire section, I think.
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
            )
        self.soup = soup

        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super().__init__(namespaceHTMLElements)

        # A real html5lib HTMLParser object gets assigned here later on;
        # elementClass() uses it to find the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self) -> "Element":
        """Reset the `BeautifulSoup` object and wrap it as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token: Dict[str, Any]) -> None:
        """Build a `Doctype` from the token's ``name``, ``publicId`` and
        ``systemId`` entries and register it with the soup.
        """
        doctype = Doctype.for_name_and_ids(
            cast(str, token["name"]),
            cast(Optional[str], token["publicId"]),
            cast(Optional[str], token["systemId"]),
        )
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name: str, namespace: str) -> "Element":
        """Create a new tag, wrapped in an `Element`.

        When a parser is attached and line numbers are being stored,
        the tag is given the parser's current stream position.
        """
        line: Optional[int] = None
        column: Optional[int] = None
        if self.parser is not None and self.store_line_numbers:
            # The stream position is the point just past the end of the
            # tag. Where the tag began is unknown, but it ended on the
            # character immediately before that point.
            line, column = self.parser.tokenizer.stream.position()  # type:ignore
            assert column is not None
            column -= 1

        new_tag = self.soup.new_tag(
            name, namespace, sourceline=line, sourcepos=column
        )
        return Element(new_tag, self.soup, namespace)

    def commentClass(self, data: str) -> "TextNode":
        """Wrap ``data`` in a `Comment` and return it as a `TextNode`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self) -> "Element":
        """This is only used by html5lib HTMLParser.parseFragment(),
        which is never used by Beautiful Soup, only by the html5lib
        unit tests. Since we don't currently hook into those tests,
        the implementation is left blank.
        """
        raise NotImplementedError()

    def getFragment(self) -> "Element":
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()

    def appendChild(self, node: "Element") -> None:
        """Append the node's underlying element to the soup."""
        # TODO: This code is not covered by the BS4 tests, and
        # apparently not triggered by the html5lib test suite either.
        # But it doesn't seem test-specific and there are calls to it
        # (or a method with the same name) all over html5lib, so I'm
        # leaving the implementation in place rather than replacing it
        # with NotImplementedError()
        self.soup.append(node.element)

    def getDocument(self) -> "BeautifulSoup":
        """Return the `BeautifulSoup` object being populated."""
        return self.soup

    def testSerializer(self, node: "Element") -> None:
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()

275 

276 

class AttrList:
    """Represents a Tag's attributes in a way compatible with html5lib.

    Reads (iteration, lookup, length, membership) are answered from a
    copy of the tag's attributes made when this object is created.
    Writes go to the tag itself and do not update that copy.
    """

    element: Tag
    attrs: _AttributeValues

    def __init__(self, element: Tag):
        self.element = element
        # Work from a copy so that reads don't depend on later changes
        # to the tag's own attribute dictionary.
        self.attrs = dict(self.element.attrs)

    def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Iterate over ``(name, value)`` pairs from the copied attributes."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name: str, value: _AttributeValue) -> None:
        """Set an attribute on the underlying tag.

        A string value for an attribute that is multi-valued for this
        tag is first split on whitespace into a list.
        """
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if name in list_attr.get("*", []) or name in list_attr.get(
            self.element.name, []
        ):
            # A node that is being cloned may have already undergone
            # this procedure. Check for this and skip it.
            if not isinstance(value, list):
                assert isinstance(value, str)
                value = self.element.attribute_value_list_class(
                    nonwhitespace_re.findall(value)
                )
        self.element[name] = value

    def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Return a list of ``(name, value)`` pairs."""
        return list(self.attrs.items())

    def keys(self) -> Iterable[str]:
        """Return a list of attribute names."""
        return list(self.attrs.keys())

    def __len__(self) -> int:
        return len(self.attrs)

    def __getitem__(self, name: str) -> _AttributeValue:
        return self.attrs[name]

    def __contains__(self, name: str) -> bool:
        # Test the dictionary directly rather than building and
        # scanning a list of its keys.
        return name in self.attrs

321 

322 

class BeautifulSoupNode(treebuilder_base.Node):
    """Common base for the html5lib node wrappers defined in this module."""

    # Exactly one of `tag` and `string` is expected to be set: a node
    # stands for _either_ a Tag _or_ a NavigableString.
    tag: Optional[Tag]
    string: Optional[NavigableString]
    soup: "BeautifulSoup"
    namespace: Optional[_NamespaceURL]

    @property
    def element(self) -> PageElement:
        """The wrapped object: the tag if there is one, otherwise the string."""
        wrapped_tag = self.tag
        if wrapped_tag is not None:
            return wrapped_tag

        # No tag, so this node has to be wrapping a string.
        wrapped_string = self.string
        assert wrapped_string is not None
        return wrapped_string

    @property
    def nodeType(self) -> int:
        """Return the html5lib constant corresponding to the type of
        the underlying DOM object.

        NOTE: This property is only accessed by the html5lib test
        suite, not by Beautiful Soup proper.
        """
        raise NotImplementedError

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:  # type:ignore
        """Not implemented at this level; `Element` provides its own."""
        raise NotImplementedError

353 

354 

class Element(BeautifulSoupNode):
    """An html5lib node that wraps a Beautiful Soup `Tag`."""

    namespace: Optional[_NamespaceURL]

    def __init__(
        self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
    ):
        """Wrap ``element``.

        :param element: The tag this node stands for.
        :param soup: The `BeautifulSoup` object being populated.
        :param namespace: The tag's namespace, if any.
        """
        # NOTE(review): these assignments come before the base-class
        # initializer on purpose, presumably because the base
        # initializer touches `attributes`, which is a property here
        # and needs self.tag -- confirm against html5lib before
        # reordering.
        self.tag = element
        self.string = None
        self.soup = soup
        self.namespace = namespace
        treebuilder_base.Node.__init__(self, element.name)

    def appendChild(self, node: "BeautifulSoupNode") -> None:
        """Add ``node`` as the last child of this element.

        A plain `NavigableString` appended right after another plain
        `NavigableString` is merged with it into a single new string.
        """
        string_child: Optional[NavigableString] = None
        child: PageElement
        if type(node.string) is NavigableString:
            # We check for NavigableString *only* because we want to avoid
            # joining PreformattedStrings, such as Comments, with nearby strings.
            string_child = child = node.string
        else:
            child = node.element
        node.parent = self

        if (
            child is not None
            and child.parent is not None
            and not isinstance(child, str)
        ):
            node.element.extract()

        if (
            string_child is not None
            and self.tag is not None and self.tag.contents
            and type(self.tag.contents[-1]) is NavigableString
        ):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.tag.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.tag is not None and self.tag.contents:
                most_recent_element = self.tag._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.tag, most_recent_element=most_recent_element
            )

    def getAttributes(self) -> AttrList:
        """Return a new `AttrList` view of this tag's attributes."""
        assert self.tag is not None
        return AttrList(self.tag)

    # An HTML5lib attribute name may either be a single string,
    # or a tuple (namespace, name).
    _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
    # Now we can define the type this method accepts as a dictionary
    # mapping those attribute names to single string values.
    _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]

    def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
        """Copy html5lib-style attributes onto the underlying tag.

        Does nothing when ``attributes`` is None or empty. Note that
        ``attributes`` is modified in place: tuple names are replaced
        with `NamespacedAttribute` objects.
        """
        assert self.tag is not None
        if attributes is not None and len(attributes) > 0:
            # Replace any namespaced attributes with
            # NamespacedAttribute objects.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            # We can now cast attributes to the type of Dict
            # used by Beautiful Soup.
            normalized_attributes = cast(_AttributeValues, attributes)

            # Values for tags like 'class' came in as single strings;
            # replace them with lists of strings as appropriate.
            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, normalized_attributes
            )

            # Then set the attributes on the Tag associated with this
            # BeautifulSoupNode.
            for name, value_or_values in list(normalized_attributes.items()):
                self.tag[name] = value_or_values

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.tag)

    attributes = property(getAttributes, setAttributes)

    def insertText(
        self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
    ) -> None:
        """Insert ``data`` as a new string, either before the given
        node or, if none is given, at the end of this element.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(
        self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
    ) -> None:
        """Insert ``node`` into this element just before ``refNode``.

        A plain `NavigableString` inserted right after another plain
        `NavigableString` is merged with it into a single new string.
        """
        assert self.tag is not None
        index = self.tag.index(refNode.element)
        if (
            type(node.element) is NavigableString
            # Only look at the preceding sibling when there is one. At
            # index 0, contents[index - 1] would wrap around to the
            # *last* child, and the new text could be merged into the
            # wrong end of the tag.
            and index > 0
            and type(self.tag.contents[index - 1]) is NavigableString
        ):
            # (See comments in appendChild)
            old_node = self.tag.contents[index - 1]
            assert type(old_node) is NavigableString
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.tag.insert(index, node.element)
            node.parent = self

    def removeChild(self, node: "Element") -> None:
        """Detach the node's underlying element from the tree."""
        node.element.extract()

    def reparentChildren(self, newParent: "Element") -> None:
        """Move all of this tag's children into another tag."""
        element = self.tag
        assert element is not None
        new_parent_element = newParent.tag
        assert new_parent_element is not None
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.

            # We can make this assertion since we know new_parent has
            # children.
            assert new_parents_last_descendant is not None
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = (
                new_parents_last_descendant.next_element
            )
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(
                is_initialized=False, accept_self=True
            )

            # Since we passed accept_self=True into _last_descendant,
            # there's no possibility that the result is None.
            assert last_childs_last_descendant is not None
            last_childs_last_descendant.next_element = (
                new_parents_last_descendant_next_element
            )
            if new_parents_last_descendant_next_element is not None:
                # TODO-COVERAGE: This code has no test coverage and
                # I'm not sure how to get html5lib to go through this
                # path, but it's just the other side of the previous
                # line.
                new_parents_last_descendant_next_element.previous_element = (
                    last_childs_last_descendant
                )
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # hasContent returns a boolean, not None.
    def hasContent(self) -> bool:  # type:ignore
        """Report whether the tag has any children (True when there is no tag)."""
        return self.tag is None or len(self.tag.contents) > 0

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:  # type:ignore
        """Return a new `Element` with the same tag name, namespace and
        attributes. Children are not copied.
        """
        assert self.tag is not None
        tag = self.soup.new_tag(self.tag.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
        """Return ``(namespace, name)``, using the HTML namespace when
        this element has none of its own.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

603 

604 

class TextNode(BeautifulSoupNode):
    """An html5lib node that wraps a string rather than a tag."""

    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
        """Wrap the string ``element``; the node has no name and no tag."""
        super().__init__(None)
        self.soup = soup
        self.string = element
        self.tag = None