Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/_html5lib.py: 3%

264 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4__all__ = [ 

5 'HTML5TreeBuilder', 

6 ] 

7 

8import warnings 

9import re 

10from bs4.builder import ( 

11 DetectsXMLParsedAsHTML, 

12 PERMISSIVE, 

13 HTML, 

14 HTML_5, 

15 HTMLTreeBuilder, 

16 ) 

17from bs4.element import ( 

18 NamespacedAttribute, 

19 nonwhitespace_re, 

20) 

21import html5lib 

22from html5lib.constants import ( 

23 namespaces, 

24 prefixes, 

25 ) 

26from bs4.element import ( 

27 Comment, 

28 Doctype, 

29 NavigableString, 

30 Tag, 

31 ) 

32 

33try: 

34 # Pre-0.99999999 

35 from html5lib.treebuilders import _base as treebuilder_base 

36 new_html5lib = False 

37except ImportError as e: 

38 # 0.99999999 and up 

39 from html5lib.treebuilders import base as treebuilder_base 

40 new_html5lib = True 

41 

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree.

    Note that this TreeBuilder does not support some features common
    to HTML TreeBuilders. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    * This TreeBuilder doesn't use different subclasses of NavigableString
      based on the name of the tag in which the string was found.

    * You can't use a SoupStrainer to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib can tell us which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Remember the requested encoding and hand the markup back untouched.

        :param markup: The document to parse (text or bytes).
        :param user_specified_encoding: Encoding requested by the caller;
            stored on the builder so that `feed` can pass it to html5lib.
        :param document_declared_encoding: Accepted but not used.
        :param exclude_encodings: Not supported; a warning is issued when
            a truthy value is given.
        :yield: A single tuple ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn(
                "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
                stacklevel=3
            )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, building the tree into `self.soup`.

        :param markup: The document to parse. When it is not a `str`, the
            encoding stored by `prepare_markup` is forwarded to html5lib.

        After parsing, ``original_encoding`` is set on the object returned
        by html5lib: ``None`` for `str` input, otherwise the name of the
        encoding reported by html5lib's input stream.
        """
        if self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4
            )

        # Constructing the parser calls create_treebuilder(), which is
        # what sets self.underlying_builder.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser
        try:
            extra_kwargs = {}
            if not isinstance(markup, str):
                # The keyword for forcing an encoding was renamed
                # between html5lib versions.
                if new_html5lib:
                    extra_kwargs['override_encoding'] = self.user_specified_encoding
                else:
                    extra_kwargs['encoding'] = self.user_specified_encoding
            doc = parser.parse(markup, **extra_kwargs)

            # Set the character encoding detected by the tokenizer.
            if isinstance(markup, str):
                # We need to special-case this because html5lib sets
                # charEncoding to UTF-8 if it gets Unicode input.
                doc.original_encoding = None
            else:
                original_encoding = parser.tokenizer.stream.charEncoding[0]
                if not isinstance(original_encoding, str):
                    # In 0.99999999 and up, the encoding is an html5lib
                    # Encoding object. We want to use a string for
                    # compatibility with other tree builders.
                    original_encoding = original_encoding.name
                doc.original_encoding = original_encoding
        finally:
            # Always drop the parser reference, even when parsing fails,
            # so the underlying builder never keeps pointing at a parser
            # whose run is over.
            self.underlying_builder.parser = None

    def create_treebuilder(self, namespaceHTMLElements):
        """Create, remember and return the html5lib-facing tree builder.

        Passed to `html5lib.HTMLParser` as its ``tree`` factory.
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><head></head><body>%s</body></html>' % fragment

126 

127 

class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """html5lib tree builder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Set up the builder around a BeautifulSoup object.

        :param namespaceHTMLElements: Forwarded to the html5lib base class.
        :param soup: The BeautifulSoup object to build into. When falsy, a
            fresh empty one is created.
        :param store_line_numbers: Whether `elementClass` should record
            source line/position on the tags it creates.
        :param kwargs: Forwarded to the BeautifulSoup constructor when a
            new soup has to be created.
        """
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from an html5lib doctype token and add it to the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an `Element`."""
        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos - 1
        tag = self.soup.new_tag(name, namespace, **kwargs)

        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap `data` as a Comment inside a `TextNode`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace the soup with a fresh one named as a document fragment."""
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` directly to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the BeautifulSoup object being built."""
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind the base class's fragment."""
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        """Serialize a tree as indented, ``|``-prefixed lines of text.

        :param element: The Beautiful Soup object at the root of the tree.
        :return: One string with one line per doctype, comment, string,
            tag and attribute; children are indented two spaces deeper
            than their parent, and attributes are sorted by name.
        """
        rv = []
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            pad = ' ' * indent
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        # A PUBLIC or SYSTEM identifier was present.
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (pad, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (pad, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (pad,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (pad, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (pad, element))
            else:
                # Anything else is treated as a tag (this includes the
                # root object itself).
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (pad, name))
                if element.attrs:
                    attributes = []
                    for attr_name, attr_value in list(element.attrs.items()):
                        if isinstance(attr_name, NamespacedAttribute):
                            attr_name = "%s %s" % (
                                prefixes[attr_name.namespace], attr_name.name)
                        if isinstance(attr_value, list):
                            # Multi-valued attribute: show as one string.
                            attr_value = " ".join(attr_value)
                        attributes.append((attr_name, attr_value))

                    for attr_name, attr_value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (
                            ' ' * (indent + 2), attr_name, attr_value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)

        serializeElement(element, 0)

        return "\n".join(rv)

247 

class AttrList(object):
    """Mapping-style view over an element's attributes.

    Reads are served from a copy of ``element.attrs`` taken at
    construction time; writes go through to the element and are mirrored
    into that copy so later reads on the same object see them.
    """

    def __init__(self, element):
        self.element = element
        # Snapshot of the element's attributes; kept in step by __setitem__.
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs (not just names)."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set an attribute on the wrapped element.

        If the element declares `name` as a multi-valued attribute (via
        ``cdata_list_attributes``, under ``'*'`` or under the element's own
        name), a non-list value is split on whitespace into a list first.
        """
        list_attr = self.element.cdata_list_attributes or {}
        if (name in list_attr.get('*', [])
                or name in list_attr.get(self.element.name, [])):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value
        # Mirror the write so `in`, `[]`, `len()` and iteration on this
        # same object reflect the attribute that was just set.
        self.attrs[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs

276 

277 

class Element(treebuilder_base.Node):
    """html5lib node that wraps a Beautiful Soup object.

    html5lib drives tree construction through this interface; each
    operation is carried out on the wrapped object in ``self.element``.
    """

    def __init__(self, element, soup, namespace):
        """
        :param element: The Beautiful Soup object to wrap; its ``name``
            becomes the html5lib node name.
        :param soup: The BeautifulSoup object that owns the tree.
        :param namespace: Namespace of the element, or None.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of this element.

        :param node: A plain `str`, a `Tag`, or a wrapper object exposing
            the Beautiful Soup object as ``node.element``.

        A string appended directly after an existing NavigableString is
        merged into it rather than added as a separate child.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            # Detach the child from wherever it currently lives. Use
            # `child` rather than `node.element`: when a bare Tag was
            # passed in, `child` is the Tag itself and there is no
            # wrapper to go through.
            child.extract()

        if (string_child is not None and self.element.contents
                and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return an `AttrList` over the element's attributes ({} for a Comment)."""
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        :param attributes: A mapping of attribute name to value, or None.
            Nothing happens for None or an empty mapping. Tuple keys are
            replaced in place by `NamespacedAttribute` objects, so the
            mapping passed in is modified.
        """
        if attributes is not None and len(attributes) > 0:
            # Iterate over a snapshot, since keys are swapped in the loop.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` if given, else at the end."""
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the existing child `refNode`.

        A NavigableString inserted directly after another NavigableString
        is merged into that preceding string instead of being added as a
        separate child.
        """
        index = self.element.index(refNode.element)
        contents = self.element.contents
        # Only look at the preceding sibling when there is one: with
        # index == 0, contents[index - 1] would wrap around to the *last*
        # child and the text could be merged into the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
                and contents[index - 1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = contents[index - 1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's contents (truthy when it has children)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

471 

class TextNode(Element):
    """Node wrapper for a string-like Beautiful Soup object.

    Elsewhere in this module it wraps `Comment` objects and strings
    created with ``soup.new_string``.
    """

    def __init__(self, element, soup):
        """
        :param element: The string-like object to wrap.
        :param soup: The BeautifulSoup object that owns the tree.
        """
        # Initialise the html5lib base node directly, with no name;
        # Element.__init__ is bypassed because it reads ``element.name``
        # and expects a namespace argument.
        treebuilder_base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError