Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/_html5lib.py: 48%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

264 statements  

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4__all__ = [ 

5 'HTML5TreeBuilder', 

6 ] 

7 

8import warnings 

9import re 

10from bs4.builder import ( 

11 DetectsXMLParsedAsHTML, 

12 PERMISSIVE, 

13 HTML, 

14 HTML_5, 

15 HTMLTreeBuilder, 

16 ) 

17from bs4.element import ( 

18 NamespacedAttribute, 

19 nonwhitespace_re, 

20) 

21import html5lib 

22from html5lib.constants import ( 

23 namespaces, 

24 prefixes, 

25 ) 

26from bs4.element import ( 

27 Comment, 

28 Doctype, 

29 NavigableString, 

30 Tag, 

31 ) 

32 

# html5lib renamed its tree-builder base module between releases. Import
# whichever one is available and remember which API generation we are
# talking to, because the keyword used to pass an encoding to the parser
# differs between the two.
try:
    # Pre-0.99999999
    from html5lib.treebuilders import _base as treebuilder_base
    new_html5lib = False
except ImportError:
    # 0.99999999 and up
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True

41 

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree.

    Note that this TreeBuilder does not support some features common
    to HTML TreeBuilders. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    * This TreeBuilder doesn't use different subclasses of NavigableString
      based on the name of the tag in which the string was found.

    * You can't use a SoupStrainer to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib can tell us which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Record the requested encoding and hand the markup back untouched.

        :param markup: The document to parse (text or bytes).
        :param user_specified_encoding: Encoding requested by the caller;
            stored on the builder so that `feed` can pass it to html5lib.
        :param document_declared_encoding: Accepted but not used.
        :param exclude_encodings: Not supported; a warning is issued if
            a truthy value is given.
        :yield: A single 4-tuple ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Run html5lib over the markup, populating ``self.soup``.

        After a successful parse, ``original_encoding`` is set on the
        object returned by html5lib: ``None`` for text input, otherwise
        the name of the encoding reported by the tokenizer's stream.

        :param markup: The document to parse (text or bytes).
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")

        # Constructing the parser calls create_treebuilder(), which is
        # what sets self.underlying_builder.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser
        try:
            if isinstance(markup, str):
                doc = parser.parse(markup)
                # We need to special-case this because html5lib sets
                # charEncoding to UTF-8 if it gets Unicode input.
                doc.original_encoding = None
            else:
                # The keyword for forcing an encoding depends on which
                # generation of html5lib is installed.
                keyword = 'override_encoding' if new_html5lib else 'encoding'
                doc = parser.parse(
                    markup, **{keyword: self.user_specified_encoding}
                )

                # Set the character encoding detected by the tokenizer.
                original_encoding = parser.tokenizer.stream.charEncoding[0]
                if not isinstance(original_encoding, str):
                    # In 0.99999999 and up, the encoding is an html5lib
                    # Encoding object. We want to use a string for
                    # compatibility with other tree builders.
                    original_encoding = original_encoding.name
                doc.original_encoding = original_encoding
        finally:
            # Drop the parser reference even if parsing blew up, so the
            # underlying builder never keeps pointing at a dead parser.
            self.underlying_builder.parser = None

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory passed to html5lib as its ``tree`` argument.

        Creates the `TreeBuilderForHtml5lib` bound to ``self.soup``,
        remembers it as ``self.underlying_builder`` and returns it.

        :param namespaceHTMLElements: Forwarded unchanged to the
            `TreeBuilderForHtml5lib` constructor.
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><head></head><body>%s</body></html>' % fragment

120 

121 

class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """html5lib tree builder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Bind the builder to a soup, creating an empty one if needed.

        :param namespaceHTMLElements: Passed to the html5lib base class.
        :param soup: The BeautifulSoup object to populate. If falsy, a
            fresh empty one is created.
        :param store_line_numbers: Whether `elementClass` should record
            source positions on new tags.
        :param kwargs: Forwarded to the BeautifulSoup constructor, and
            only when a soup has to be created here.
        """
        if not soup:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
            # TODO: What are **kwargs exactly? Should they be passed in
            # here in addition to/instead of being passed to the
            # BeautifulSoup constructor?
        self.soup = soup
        super().__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return an `Element` wrapping it."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a `Doctype` from a doctype token and feed it to the soup."""
        doctype = Doctype.for_name_and_ids(
            token["name"], token["publicId"], token["systemId"]
        )
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an `Element`."""
        position = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            line, column = self.parser.tokenizer.stream.position()
            position = {'sourceline': line, 'sourcepos': column - 1}
        new_tag = self.soup.new_tag(name, namespace, **position)
        return Element(new_tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a `TextNode` wrapping a `Comment` built from `data`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace ``self.soup`` with an empty fragment soup and wrap it."""
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        fragment_soup = BeautifulSoup("", "html.parser")
        fragment_soup.name = "[document_fragment]"
        self.soup = fragment_soup
        return Element(fragment_soup, fragment_soup, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` directly to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the soup being built."""
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind the base class's fragment."""
        return super().getFragment().element

    def testSerializer(self, element):
        """Render a tree as '|'-prefixed lines, one node or attribute each.

        :param element: Root Beautiful Soup object to serialize.
        :return: The lines joined with newlines.
        """
        lines = []
        doctype_pattern = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def emit(node, depth=0):
            pad = ' ' * depth

            # Doctype and Comment are tested before NavigableString so
            # that the more specific branch wins.
            if isinstance(node, Doctype):
                match = doctype_pattern.match(node)
                if not match:
                    lines.append("|%s<!DOCTYPE >" % (pad,))
                elif match.lastindex > 1:
                    public_id = match.group(2) or ""
                    system_id = match.group(3) or match.group(4) or ""
                    lines.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                 (pad, match.group(1), public_id, system_id))
                else:
                    lines.append("|%s<!DOCTYPE %s>" % (pad, match.group(1)))
                return

            if isinstance(node, Comment):
                lines.append("|%s<!-- %s -->" % (pad, node))
                return

            if isinstance(node, NavigableString):
                lines.append("|%s\"%s\"" % (pad, node))
                return

            # Anything else is treated as a tag (this includes the soup
            # object itself).
            if node.namespace:
                label = "%s %s" % (prefixes[node.namespace], node.name)
            else:
                label = node.name
            lines.append("|%s<%s>" % (pad, label))

            if node.attrs:
                pairs = []
                for attr_name, attr_value in list(node.attrs.items()):
                    if isinstance(attr_name, NamespacedAttribute):
                        attr_name = "%s %s" % (
                            prefixes[attr_name.namespace], attr_name.name)
                    if isinstance(attr_value, list):
                        attr_value = " ".join(attr_value)
                    pairs.append((attr_name, attr_value))

                for attr_name, attr_value in sorted(pairs):
                    lines.append('|%s%s="%s"' % (
                        ' ' * (depth + 2), attr_name, attr_value))

            for child in node.children:
                emit(child, depth + 2)

        emit(element, 0)

        return "\n".join(lines)

241 

class AttrList(object):
    """Mapping-like view of one tag's attributes.

    Reads are served from a snapshot of ``element.attrs`` taken at
    construction time. Writes go through to the element itself and are
    mirrored into the snapshot, so that reads on the same `AttrList`
    observe earlier writes made through it.
    """

    def __init__(self, element):
        """:param element: The tag whose attributes are being exposed."""
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs (not just names)."""
        # Iterate over a copy so callers may assign through this object
        # while looping.
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set an attribute on the underlying element.

        If the element declares `name` as a multi-valued attribute
        (globally under ``'*'`` or for this tag name), a non-list value
        is split into a list of its non-whitespace runs first.
        """
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if (name in list_attr.get('*', [])
            or (self.element.name in list_attr
                and name in list_attr.get(self.element.name, []))):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value
        # Keep the snapshot in step with the element; otherwise
        # __contains__/__getitem__/__len__ would not see this write.
        self.attrs[name] = value

    def items(self):
        """Return a list of ``(name, value)`` pairs."""
        return list(self.attrs.items())

    def keys(self):
        """Return a list of attribute names."""
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs

270 

271 

class Element(treebuilder_base.Node):
    """html5lib node that wraps a Beautiful Soup object.

    html5lib manipulates the tree through this interface; every
    operation is translated into the equivalent change on the wrapped
    Beautiful Soup object in ``self.element``.
    """

    def __init__(self, element, soup, namespace):
        """
        :param element: The Beautiful Soup object being wrapped; its
            ``name`` is passed to the html5lib base class.
        :param soup: The BeautifulSoup object the element belongs to.
        :param namespace: The element's namespace, or None.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of this element.

        :param node: An `Element`/`TextNode` wrapper, or a bare string
            or `Tag` passed in directly. Text appended right after an
            existing plain NavigableString is merged into it.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            # Detach the object from wherever it currently lives. Use
            # `child` rather than `node.element`: they are the same
            # object for wrappers, and `child` is also right when a
            # bare Tag was passed in.
            child.extract()

        if (string_child is not None and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return an `AttrList` for the element ({} for a Comment)."""
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced, in the passed-in mapping itself, by
        `NamespacedAttribute` objects built from the tuple. Nothing
        happens when `attributes` is None or empty.

        :param attributes: Mapping of attribute name to value, or None.
        """
        if attributes is not None and len(attributes) > 0:
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` if one is given.

        :param data: The text to insert.
        :param insertBefore: Optional child node to insert in front of;
            when falsy the text is appended.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the child `refNode`.

        Plain text inserted directly after an existing plain
        NavigableString is merged into that string instead.
        """
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one. At
        # index 0, contents[index-1] would wrap around to the *last*
        # child and could merge the text into the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new `Element` with the same name, namespace and
        attributes, and no children."""
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the element's list of children (truthy when non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, using the HTML namespace when
        this element has none."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

465 

class TextNode(Element):
    """Node wrapper for a string-like Beautiful Soup object.

    Elsewhere in this module it is built around NavigableString and
    Comment objects.
    """

    def __init__(self, element, soup):
        """
        :param element: The Beautiful Soup string object being wrapped.
        :param soup: The BeautifulSoup object the string belongs to.
        """
        # Initialise the html5lib base node directly, with no name.
        # Element.__init__ is deliberately not called.
        treebuilder_base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError

473 raise NotImplementedError