Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/lxml/html/__init__.py: 1%

967 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-07 06:38 +0000

1# Copyright (c) 2004 Ian Bicking. All rights reserved. 

2# 

3# Redistribution and use in source and binary forms, with or without 

4# modification, are permitted provided that the following conditions are 

5# met: 

6# 

7# 1. Redistributions of source code must retain the above copyright 

8# notice, this list of conditions and the following disclaimer. 

9# 

10# 2. Redistributions in binary form must reproduce the above copyright 

11# notice, this list of conditions and the following disclaimer in 

12# the documentation and/or other materials provided with the 

13# distribution. 

14# 

15# 3. Neither the name of Ian Bicking nor the names of its contributors may 

16# be used to endorse or promote products derived from this software 

17# without specific prior written permission. 

18# 

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 

20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 

21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 

22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 

23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 

24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 

25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 

26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 

27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 

28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 

29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

30 

31"""The ``lxml.html`` tool set for HTML handling. 

32""" 

33 

34 

# Public names exported by ``from lxml.html import *``.
__all__ = [
    'document_fromstring',
    'fragment_fromstring',
    'fragments_fromstring',
    'fromstring',
    'tostring',
    'Element',
    'defs',
    'open_in_browser',
    'submit_form',
    'find_rel_links',
    'find_class',
    'make_links_absolute',
    'resolve_base_href',
    'iterlinks',
    'rewrite_links',
    'parse',
]

40 

41 

42import copy 

43import re 

44 

45from collections.abc import MutableMapping, MutableSet 

46from functools import partial 

47from urllib.parse import urljoin 

48 

49from .. import etree 

50from . import defs 

51from ._setmixin import SetMixin 

52 

53 

def __fix_docstring(s):
    """Drop the ``u`` prefix from ``u'...'`` literals that start a line.

    Only a ``u'`` preceded by nothing but whitespace on its line is
    rewritten.  Falsy input (``None`` or ``''``) is returned unchanged.
    """
    # TODO: remove and clean up doctests
    if s:
        return re.sub(r"^(\s*)u'", r"\1'", s, flags=re.M)
    return s

60 

61 

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Where an element name is involved, both
# the plain HTML name and the XHTML-namespaced name (prefix ``x``) are matched.
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x': XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches $class_name as one whitespace-delimited token of @class.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# CSS ``url(...)`` references; group 1 is the (possibly quoted) URL.
_iter_css_urls = re.compile(
    r'url\(('
    '["][^"]*["]|'
    "['][^']*[']|"
    r'[^)]*)\)',
    re.I).finditer
# CSS ``@import "..."`` statements; group 1 is the URL between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces={'x': XHTML_NAMESPACE})
# One space-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
# URL part of a <meta http-equiv="refresh" content="N; url=..."> value.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

81 

82 

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from *s*, adjusting *pos*.

    If *s* starts and ends with the same quote character (``"`` or ``'``),
    return the text between them together with ``pos + 1``.  Otherwise
    return *s* and *pos* unchanged.
    """
    first, last = s[:1], s[-1:]
    if first == last and first in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos

88 

89 

def _transform_result(typ, result):
    """Convert *result* back into the type the caller passed in.

    For a ``bytes`` subclass the result is serialised with ``tostring`` as
    UTF-8; for a ``str`` subclass it is serialised as unicode text.  For
    any other type *result* is returned as-is.
    """
    for text_type, encoding in ((bytes, 'utf-8'), (str, 'unicode')):
        if issubclass(typ, text_type):
            return tostring(result, encoding=encoding)
    return result

99 

100 

def _nons(tag):
    """Return *tag* without a leading XHTML namespace, if it has one.

    Only tags of the exact form ``{XHTML_NAMESPACE}name`` are shortened to
    ``name``.  Tags in other namespaces, plain tags, empty strings and
    non-string tags are returned unchanged.
    """
    if isinstance(tag, str):
        # Include the closing brace so that a different namespace which
        # merely starts with the XHTML URI is not stripped by mistake.
        prefix = '{%s}' % XHTML_NAMESPACE
        if tag.startswith(prefix):
            return tag[len(prefix):]
    return tag

106 

107 

class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # 'attributes' is the mapping that holds the 'class' entry; it is
        # read and modified in place by every method below.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_class_name(value):
        """Raise ValueError if *value* is empty or contains whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            # nothing left: drop the attribute rather than leave it empty
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        super().remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # cheap substring test first, then the exact token test
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Every name is validated like in ``add()``; a ValueError is raised
        for an empty name or one containing whitespace.  The attribute is
        only written after all names were accepted, so a failing call
        leaves it untouched.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            self._check_class_name(value)
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_class_name(value)
        classes = self._get_class_value().split()
        try:
            classes.remove(value)
            enabled = False
        except ValueError:
            classes.append(value)
            enabled = True
        if classes:
            self._attributes['class'] = ' '.join(classes)
        else:
            # only reachable after removing the last class, so the
            # attribute is known to exist
            del self._attributes['class']
        return enabled

214 

215 

class HtmlMixin:
    """HTML-specific methods and properties shared by the Html* node classes."""

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super().set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urllib.parse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The getter returns the first label whose ``for`` attribute equals
        this element's id, or None if the element has no id or no such
        label exists.
        """
        el_id = self.get('id')
        if not el_id:
            return None
        result = _label_xpath(self, id=el_id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        el_id = self.get('id')
        if not el_id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', el_id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, str):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # the tail follows our last child once we are gone
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # replace this element by its children, in place
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of ``rel`` is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        # Validate before touching the tree so that a bad argument does
        # not leave the document with its <base> tags already removed.
        if handle_failures not in (None, 'ignore', 'discard'):
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)
        if resolve_base_href:
            self.resolve_base_href(handle_failures=handle_failures)

        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                if handle_failures == 'ignore':
                    return href  # keep the link as it was
                if handle_failures == 'discard':
                    return None  # rewrite_links() removes the link
                raise

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # every <base href> tag is removed; the href of the last one wins
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # link inside the element text: splice it in at 'pos'
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

619 

620 

class _MethodFunc:
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string (``str`` or
    ``bytes``).  It returns whatever the function normally returns, or if
    the function works in-place (and so returns None) it returns a
    serialized form of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: name of the method to call on the (parsed) document
        # copy: default for whether element inputs are deep-copied first
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        # bytes input is parsed as well, so that the bytes branch of
        # _transform_result() can serialise the result back to bytes
        if isinstance(doc, (str, bytes)):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result

655 

656 

# Function forms of the HtmlMixin methods.  The ones created with
# ``copy=True`` deep-copy an element argument before calling the method;
# the others rely on the ``copy=False`` default of _MethodFunc.
find_rel_links = _MethodFunc('find_rel_links')
find_class = _MethodFunc('find_class')
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks')
rewrite_links = _MethodFunc('rewrite_links', copy=True)

663 

664 

class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment node class that also provides the HtmlMixin API."""

667 

668 

class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class that also provides the HtmlMixin API."""

671 

672 

class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing instruction class that also provides the HtmlMixin API."""

675 

676 

class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity node class that also provides the HtmlMixin API."""

679 

680 

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, which
            # cannot be unpacked into (name, value); use its items instead.
            mixin_items = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in mixin_items:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # mixins come first so that they take precedence in the MRO
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node of the given type and name.

        Element names are matched case-insensitively; unknown elements get
        ``HtmlElement``.  None is returned for unknown node types.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

721 

722 

723################################################################################ 

724# parsing 

725################################################################################ 

726 

# Matchers for input that starts (after optional whitespace) with an
# ``<html`` or ``<!doctype`` tag, case-insensitively; one for str input
# and one for bytes input.
_looks_like_full_html_unicode = re.compile(
    r'^\s*<(?:html|!doctype)', flags=re.I).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', flags=re.I).match

731 

732 

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse *html* with ``etree.fromstring`` and return the resulting root.

    Uses ``html_parser`` when no parser is given.  Raises
    ``etree.ParserError`` if parsing yields no root element.  With
    ``ensure_head_body`` true, a missing ``head`` child is inserted first
    and a missing ``body`` child is appended.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

745 

746 

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Wrap anything that does not look like a full document.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    # whitespace-only leading text is not reported
    has_leading_text = bool(body.text and body.text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = [body.text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

784 

785 

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # leading text is only acceptable when it can go into a new parent
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, str) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], str):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)" % names)
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el

833 

834 

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    looks_like_full_html = (
        _looks_like_full_html_bytes if isinstance(html, bytes)
        else _looks_like_full_html_unicode)
    is_full_html = looks_like_full_html(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        if other_body.text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + other_body.text
            else:
                body.text = (body.text or '') + other_body.text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc

    if body is None:
        return doc

    has_leading_text = bool(body.text and body.text.strip())
    if len(body) == 1 and not has_leading_text:
        only_child = body[0]
        if not (only_child.tail and only_child.tail.strip()):
            # The body has just one element, so it was probably a single
            # element passed in
            return only_child

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body

901 

902 

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

915 

916 

def _contains_block_level_tag(el):
    """
    Return True if ``el`` or any element found by iterating over it
    has a tag (as reduced by ``_nons``) listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))

924 

925 

def _element_name(el):
    """
    Return a short label for ``el``: ``'comment'`` for comment nodes,
    ``'string'`` for plain strings, otherwise ``_nons(el.tag)``.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, str):
        return 'string'
    return _nons(el.tag)

933 

934 

935################################################################################ 

936# form handling 

937################################################################################ 

938 

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.

        Assigning a mapping to ``form.fields`` sets every key found in
        the mapping and resets all other named fields to ``None``.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        fields = self.fields
        # Names present in the form that the new mapping does not mention.
        stale_keys = fields.keys()
        for key, new_value in value.items():
            if key in stale_keys:
                stale_keys.remove(key)
            fields[key] = new_value
        for key in stale_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    def _name(self):
        """
        Return a label for this form used in ``repr()`` output: its
        ``name`` attribute, else ``'#' + id``, else its index among
        the forms found below ``self.body``.
        """
        name = self.get('name')
        if name:
            return name
        ident = self.get('id')
        if ident:
            return '#' + ident
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.parse.urlencode()``.

        Inputs without a name or with a ``disabled`` attribute are
        skipped, as are unchecked checkables and inputs of type
        submit, image, reset and file.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                # Read the property once; it is recomputed on every access.
                value = el.value
                if el.multiple:
                    results.extend((name, v) for v in value)
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset', 'file'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a base URL and an ``action`` attribute,
        the action is returned joined onto the base URL.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        return action

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attrib = self.attrib
        if 'action' in attrib:
            del attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``.
        """
        return self.get('method', 'GET').upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement

1056 

1057 

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever ``open_http``
    returns; with the default opener this is a file-like object, as
    from ``urllib.request.urlopen()``.  That object also has a
    ``.geturl()`` function, which shows the URL if there were any
    redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of
    ``(name, value)`` tuples; its entries are appended to the values
    collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` unless the form says
    otherwise, e.g. ``'POST'``), the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # Fall back to the document's base URL when the form has no action.
    url = form.action or form.base_url
    return open_http(form.method, url, values)

1094 

1095 

def open_http_urllib(method, url, values):
    """
    Default ``open_http`` implementation for `submit_form`, built on
    ``urllib``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url``
    as a query string; for any other method they are sent as the
    request body.  Returns the result of ``urlopen()``.

    Raises ``ValueError`` if ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    from urllib.parse import urlencode
    from urllib.request import urlopen

    if method == 'GET':
        # Extend an existing query string rather than starting a second one.
        separator = '&' if '?' in url else '?'
        url += separator + urlencode(values)
        data = None
    else:
        # urlopen() requires the request body as bytes.
        data = urlencode(values).encode('ASCII')
    return urlopen(url, data)

1117 

1118 

class FieldsDict(MutableMapping):
    """
    Mapping over the inputs of a form: items are looked up by field
    name in the wrapped ``inputs`` accessor and read or written
    through each field's ``.value``.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Deleting a field is always refused.
        raise KeyError("You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)

1143 

1144 

class InputGetter:

    """
    An accessor that represents all the input fields in a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing one name come back as a list (a
    `CheckboxGroup`, which also allows value setting); radio inputs
    sharing one name are handled similarly.  Use ``.keys()`` and
    ``.items()`` to process all fields in this way.

    Iterating over this object yields every input element instead.
    That is not the same as walking the names, because checkboxes
    and radio elements are then returned individually.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [candidate for candidate in self if candidate.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        first = matches[0]
        input_type = first.get('type')
        if len(matches) > 1 and input_type in ('radio', 'checkbox'):
            # The type of the first match decides which group wraps them all.
            group_class = RadioGroup if input_type == 'radio' else CheckboxGroup
            group = group_class(matches)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def keys(self):
        """
        Returns all unique field names, in document order.

        :return: A list of all unique field names.
        """
        all_names = (el.name for el in self)
        # dict.fromkeys() keeps the first occurrence of each name, in order;
        # unnamed inputs (name None) are left out.
        return list(dict.fromkeys(n for n in all_names if n is not None))

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        :return: A list of (name, field) tuples.
        """
        unique_names = dict.fromkeys(el.name for el in self)
        return [(name, self[name]) for name in unique_names]

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count

1232 

1233 

class InputMixin:
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        Deleting it removes the attribute if present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        if 'name' in self.attrib:
            del self.attrib['name']

    def __repr__(self):
        # Only show a type when the element has a non-empty ``type``.
        kind = getattr(self, 'type', None)
        type_part = ' type=%r' % kind if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)

1263 

1264 

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and
    the value can be read and written as ``.value``.
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)

        Child elements, if any, are serialised and appended to the
        element's text.
        """
        in_xhtml_namespace = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if in_xhtml_namespace else 'html'
        pieces = [self.text or '']
        # Children are rare inside a textarea; serialise whatever is there.
        pieces.extend(
            etree.tostring(child, method=serialisation_method, encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop any child elements, then store the new content as text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

1298 

1299 

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, except for a
    multi-select element (``<select multiple>``), where it is a
    set-like object.  Either way, ``.value_options`` lists the
    possible values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        # The last option carrying ``selected`` wins; without any, fall
        # back to the first option that is not disabled.
        chosen = next(
            (opt for opt in reversed(options) if opt.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (opt for opt in options if opt.get('disabled') is None),
                None)
        if chosen is None:
            return None
        result = chosen.get('value')
        if result is None:
            # No value attribute: the option's stripped text is its value.
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, str):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        target = None
        if value is not None:
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = (option.text or '').strip()
                if candidate == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then mark the single matching option.
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        option text where that attribute is missing).
        """
        possible = []
        for option in _options_xpath(self):
            attr_value = option.get('value')
            possible.append(
                (option.text or '').strip() if attr_value is None else attr_value)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

1401 

1402 

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.  An option is identified by its ``value``
    attribute or, when that is missing, by its stripped text.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # Yield the values of the currently selected options.
        for option in self.options:
            if 'selected' in option.attrib:
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = (option.text or '').strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ``ValueError`` if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ``ValueError`` if no option has that value, or if the
        matching option is not currently selected.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1462 

1463 

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks the inputs, and ``.value_options`` gives the possible
    values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).  Reading gives the value of the
        first checked radio, or None if none is checked.
        """
        checked = [radio for radio in self if 'checked' in radio.attrib]
        if not checked:
            return None
        return checked[0].get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            matching = [radio for radio in self if radio.get('value') == value]
            if not matching:
                raise ValueError("There is no radio input with the value %r" % value)
            target = matching[0]
        # Uncheck everything first so that at most one radio ends up checked.
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, list.__repr__(self))

1515 

1516 

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current
        selection; a non-iterable raises ``ValueError``.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before touching any checkbox, so a rejected
        # assignment leaves the current selection intact.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1559 

1560 

class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group
    of checkboxes that share one name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Take a snapshot of the checked values before iterating.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox with ``value``; KeyError if there is none."""
        for box in self.group:
            if box.get('value') != value:
                continue
            box.set('checked', '')
            return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox with ``value``; KeyError if there is
        none or if it is not checked.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown_values = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown_values, self.group.name)

1602 

1603 

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``), and the value can be read and written as ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (it is
    false for every other type) together with a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``;
        an unchecked one reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            # A false value only unchecks; the attribute is left alone.
            self.checked = False
            return
        self.checked = True
        if isinstance(value, str):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased, with ``'text'`` as the default.
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; any other
        type raises AttributeError.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement

1699 

1700 

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.

        Returns None when the label has no (or an empty) ``for``
        attribute; otherwise the lookup is delegated to
        ``self.body.get_element_by_id()``.  Setting requires an
        element with an ``id`` attribute (``TypeError`` otherwise);
        deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the ``for`` attribute; the label's own ``id``
        # must not be touched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

1735 

1736 

1737############################################################ 

1738## Serialization 

1739############################################################ 

1740 

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, turning the tree into XHTML in place.

    Accepts either an element or an object with a ``getroot()`` method.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # No getroot(): work on the object that was passed in.
        root = html
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        current_tag = node.tag
        # Tags that already start with '{' are left as they are.
        if current_tag[0] != '{':
            node.tag = xhtml_prefix + current_tag

1754 

1755 

def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    turning the tree into plain HTML in place.

    Accepts either an element or an object with a ``getroot()`` method.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # No getroot(): work on the object that was passed in.
        root = xhtml
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip_count = len(xhtml_prefix)
    # Only visit elements whose tag is in the XHTML namespace.
    for node in root.iter(xhtml_prefix + "*"):
        node.tag = node.tag[strip_count:]

1768 

1769 

# Not a general matcher: these patterns only cover the exact form in
# which libxml2 serialises the Content-Type meta tag.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type meta tag, matching the str/bytes result type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

1853 

1854 

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an element tree.  The file is
    written with ``encoding`` if given, else the document's own
    encoding, else UTF-8.  The file URL is also printed.
    """
    import os
    import pathlib
    import tempfile
    import webbrowser
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    # as_uri() builds a well-formed, percent-encoded file URL on every
    # platform (e.g. file:///C:/... for Windows drive paths).
    url = pathlib.Path(fn).as_uri()
    print(url)
    webbrowser.open(url)

1876 

1877 

1878################################################################################ 

1879# configure Element class lookup 

1880################################################################################ 

1881 

class HTMLParser(etree.HTMLParser):
    """An HTML parser set up so that the elements it creates are
    lxml.html Element objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Each parser gets its own lookup instance.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1889 

1890 

class XHTMLParser(etree.XMLParser):
    """An XML parser set up so that the elements it creates are
    lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Each parser gets its own lookup instance.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1911 

1912 

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    passed to ``makeelement()`` of the module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)


# Shared default parser instances of this module.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()