Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/lxml/html/__init__.py: 3%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

972 statements  

1# Copyright (c) 2004 Ian Bicking. All rights reserved. 

2# 

3# Redistribution and use in source and binary forms, with or without 

4# modification, are permitted provided that the following conditions are 

5# met: 

6# 

7# 1. Redistributions of source code must retain the above copyright 

8# notice, this list of conditions and the following disclaimer. 

9# 

10# 2. Redistributions in binary form must reproduce the above copyright 

11# notice, this list of conditions and the following disclaimer in 

12# the documentation and/or other materials provided with the 

13# distribution. 

14# 

15# 3. Neither the name of Ian Bicking nor the names of its contributors may 

16# be used to endorse or promote products derived from this software 

17# without specific prior written permission. 

18# 

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 

20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 

21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 

22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 

23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 

24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 

25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 

26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 

27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 

28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 

29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

30 

31"""The ``lxml.html`` tool set for HTML handling. 

32""" 

33 

34 

35__all__ = [ 

36 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 

37 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 

38 'find_rel_links', 'find_class', 'make_links_absolute', 

39 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse'] 

40 

41 

42import copy 

43import re 

44 

45from collections.abc import MutableMapping, MutableSet 

46from functools import partial 

47from urllib.parse import urljoin 

48 

49from .. import etree 

50from . import defs 

51from ._setmixin import SetMixin 

52 

53 

def __fix_docstring(s):
    """Drop the legacy ``u`` prefix from string reprs in a docstring.

    Wherever ``u'`` is preceded only by whitespace counted from the start
    of a line, the ``u`` is removed and the whitespace is kept.  A falsy
    argument (``None`` or the empty string) is returned unchanged.
    """
    # TODO: remove and clean up doctests
    if s:
        return re.sub(r"^(\s*)u'", r"\1'", s, flags=re.M)
    return s

60 

61 

# Namespace URI of XHTML; most lookups below match both plain HTML tags
# and their XHTML-namespaced counterparts (bound to the prefix ``x``).
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements (plain or XHTML) that carry a ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements (plain or XHTML).
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements (plain or XHTML).
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-normalised ``class`` attribute contains
# ``$class_name`` as a complete space-separated token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath ``string()`` of the context node, evaluated with smart strings disabled.
_collect_string_content = etree.XPath("string()", smart_strings=False)
# Iterator over CSS ``url(...)`` occurrences (case-insensitive); group 1 is the
# argument, which may still be wrapped in double or single quotes.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterator over CSS ``@import "..."`` occurrences (double-quoted form only);
# group 1 is the text between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements anywhere in the document whose ``for`` equals ``$id``.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One run of non-space characters, i.e. one entry of a space-separated list.
_archive_re = re.compile(r'[^ ]+')
# Splits a meta-refresh ``content`` value: everything after the first ``;``
# (minus an optional ``url=`` prefix) is captured as the named group ``url``.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

81 

82 

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from a matched URL string.

    ``s`` is the matched text and ``pos`` its start offset in the
    surrounding string.  If ``s`` is wrapped in a pair of double quotes or
    a pair of single quotes, the unquoted text is returned together with
    ``pos + 1`` (the offset of the first character after the opening
    quote).  Otherwise ``(s, pos)`` is returned unchanged.

    A string consisting of a single quote character is not a quoted pair
    and is returned as is.
    """
    # Require at least two characters so that a lone quote is not taken
    # for both the opening and the closing quote.
    if len(s) >= 2 and s[:1] == s[-1:] and s[:1] in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos

88 

89 

def _transform_result(typ, result):
    """Serialise ``result`` to match the type the caller passed in.

    If ``typ`` is a ``bytes`` subclass the result is serialised with
    ``tostring()`` as UTF-8; if it is a ``str`` subclass it is serialised
    as unicode text.  For any other type ``result`` is handed back as is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, str):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)

99 

100 

def _nons(tag):
    """Return ``tag`` without its XHTML namespace, if it has one.

    A string tag of the form ``{XHTML_NAMESPACE}name`` is reduced to
    ``name``.  Everything else is returned unchanged: tags without a
    namespace, tags in any other namespace, the empty string, and
    non-string tags.
    """
    if isinstance(tag, str) and tag[:1] == '{':
        # Compare the complete namespace up to the closing brace, so that a
        # namespace that merely starts with the XHTML URI is not stripped.
        namespace, closing_brace, local_name = tag[1:].partition('}')
        if closing_brace and namespace == XHTML_NAMESPACE:
            return local_name
    return tag

106 

107 

class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # ``attributes`` is the mapping that holds the 'class' entry; every
        # operation reads and writes it directly, nothing is cached here.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Raise ValueError unless ``value`` is a non-empty, whitespace-free name."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            # Nothing left: drop the attribute instead of leaving class="".
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        super().remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # Cheap substring test first; only split when it can possibly match.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Unlike ``add()``, the names are not validated.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            # Remove every occurrence, so that the class really is disabled
            # afterwards even if the attribute listed it more than once.
            classes = [name for name in classes if name != value]
            enabled = False
        else:
            classes.append(value)
            enabled = True
        if classes:
            self._attributes['class'] = ' '.join(classes)
        else:
            # Only reachable after a removal, so the attribute exists.
            del self._attributes['class']
        return enabled

214 

215 

class HtmlMixin:
    """Mixin with the HTML-specific helpers shared by the ``Html*`` node classes.

    It adds attribute conveniences (``set``, ``classes``, ``label``),
    document navigation (``body``, ``head``, ``forms``, ``base_url``),
    tree surgery (``drop_tree``, ``drop_tag``), searching helpers and the
    link discovery / rewriting functions.
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super().set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        # The setter exists so that augmented assignments on the property
        # work; it copies the class string of the given Classes object onto
        # this element, or removes the attribute when that string is empty.
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urllib.parse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Returns None if the document has no <body> element.
        """
        # The loop returns on its first iteration, i.e. the first match.
        for element in self.getroottree().iter("body", f"{{{XHTML_NAMESPACE}}}body"):
            return element
        return None

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Returns None if the document has no <head> element.
        """
        # The loop returns on its first iteration, i.e. the first match.
        for element in self.getroottree().iter("head", f"{{{XHTML_NAMESPACE}}}head"):
            return element
        return None

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The getter returns the first <label> in the document whose ``for``
        attribute equals this element's ``id``, or None if the element has
        no id or no such label exists.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        # Associates ``label`` with this element by pointing its 'for'
        # attribute at this element's id.  Raises TypeError if this element
        # has no id or if ``label`` is not a <label> element.
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        # Detaches the associated label (if any) by deleting its 'for'
        # attribute; the <label> element itself stays in the tree.
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            # Preserve the tail text: attach it to whatever precedes us.
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, str):
            # not a Comment, etc.
            # Our leading text moves to whatever precedes us.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            # Our tail follows our last child if there is one, otherwise it
            # follows whatever precedes us.
            if len(self):
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children, in place.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison is case-insensitive and is made against the complete
        ``rel`` attribute value.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator=translator)(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for
        any other ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()

        # Pick the replacement callback according to the failure policy.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # Every <base href> tag is removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # 'archive' is a space-separated list; report each
                    # entry with its offset inside the attribute value.
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                # Collect url(...) and @import "..." links of the stylesheet
                # text as (start_pos, url) pairs.
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            # The callback sees the stripped link; the comparison and the
            # splicing below use the link exactly as found.
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # Link inside the element text: splice at its position.
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

623 

624 

class _MethodFunc:
    """
    Callable adapter that exposes an element method as a module-level
    function.

    The function accepts either an element or an HTML string (``str`` or
    ``bytes``).  Whatever the method returns is passed through; if the
    method returns None (it worked in place), the processed document is
    returned instead, serialised back to the input type for string input.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Borrow the documentation of the wrapped method.
        self.__doc__ = getattr(source_class, name).__doc__

    def __call__(self, doc, *args, **kw):
        input_type = type(doc)
        if isinstance(doc, (str, bytes)):
            # String input is parsed first; there is nothing to copy then.
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            # Element input: work on a deep copy when asked to, either by
            # the caller's 'copy' keyword or by this wrapper's default.
            doc = copy.deepcopy(doc)
        outcome = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if outcome is not None:
            return outcome
        # Then return what we got in
        return _transform_result(input_type, doc)

659 

660 

# Module-level function versions of the HtmlMixin methods of the same name.
# ``copy=True`` wrappers deep-copy an element argument before calling the
# method on it, unless the caller passes ``copy=False``.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

667 

668 

class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment node class: ``etree.CommentBase`` plus the ``HtmlMixin`` API."""
    pass

671 

672 

class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class: ``etree.ElementBase`` plus the ``HtmlMixin`` API."""
    pass

675 

676 

class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing instruction class: ``etree.PIBase`` plus the ``HtmlMixin`` API."""
    pass

679 

680 

class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity node class: ``etree.EntityBase`` plus the ``HtmlMixin`` API."""
    pass

683 

684 

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs; the latter allows several mixins
    for the same tag name.
    """
    # Tag name -> Element class used when no ``classes`` argument is given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly only yields its keys, which
            # cannot be unpacked into (name, value); use its items instead.
            # Any other iterable is expected to yield pairs already.
            mixin_items = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in mixin_items:
                if name == '*':
                    # '*' applies to every tag that has a registered class.
                    for n in classes:
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                # Derive a class of the same name with the mixins placed
                # in front of the current class.
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None to defer the decision.

        Elements are looked up by lower-cased tag name and default to
        ``HtmlElement``; comments, processing instructions and entities map
        to their ``Html*`` classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

725 

726 

727################################################################################ 

728# parsing 

729################################################################################ 

730 

# Matchers that tell whether input starts (after optional whitespace) with
# ``<html`` or ``<!doctype``, case-insensitively.  One is compiled for
# ``str`` input and one for ``bytes`` input.
_looks_like_full_html_unicode = re.compile(
    r'^\s*<(?:html|!doctype)', re.I).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', re.I).match

735 

736 

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``head`` child is inserted at
    the front of the root and a missing ``body`` child is appended.

    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    root = etree.fromstring(html, html_parser if parser is None else parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

749 

750 

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parse a sequence of HTML elements and return them as a list.

    If the input starts with non-whitespace text, that text is the first
    item of the list (a string).  With ``no_leading_text`` set, such
    leading text raises ``etree.ParserError`` instead, so the result only
    ever contains elements.

    ``base_url`` is passed on to the parser and sets the document's
    base_url attribute (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that is not a complete document gets wrapped in one, so that
    # the fragments end up as children of a single <body>.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    root = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(root.tag) == 'html'
    bodies = [child for child in root if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    has_leading_text = bool(leading_text and leading_text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    fragments = [leading_text] if has_leading_text else []
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments

788 

789 

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element and return it.

    Without ``create_parent`` it is an error (``etree.ParserError``) if
    the input holds no element, more than one element, or anything other
    than whitespace before or after the element.

    If ``create_parent`` is true, everything that was parsed is placed
    inside a new parent element, which is returned.  A string value names
    the parent's tag; any other true value means ``div``.  Leading and
    trailing text and multiple elements are accepted in this mode.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text can only be kept if there is a parent to hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, str) else 'div'
        wrapper = Element(parent_tag)
        if elements and isinstance(elements[0], str):
            # A leading string item becomes the text of the parent.
            wrapper.text = elements.pop(0)
        wrapper.extend(elements)
        return wrapper

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        found = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError("Multiple elements found (%s)" % found)
    el = elements[0]
    tail = el.tail
    if tail and tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Whitespace-only tail is dropped from the returned element.
    el.tail = None
    return el

837 

838 

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse ``html`` and return a single element or document root.

    The input may be a complete document or just a fragment.  The result
    is the smallest wrapper that fits:

    * input starting with ``<html`` or a doctype: the document root;
    * a parsed document that has a ``head``: the document root;
    * a body holding one element and no surrounding text: that element;
    * otherwise the body, renamed to ``div`` if it contains block-level
      tags and to ``span`` if it does not.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        looks_like_full_html = _looks_like_full_html_bytes
    else:
        looks_like_full_html = _looks_like_full_html_unicode
    is_full_html = looks_like_full_html(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        if other_body.text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + other_body.text
            else:
                body.text = (body.text or '') + other_body.text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc
    if body is None:
        return doc

    def is_blank(text):
        return not text or not text.strip()

    if len(body) == 1 and is_blank(body.text) and is_blank(body[-1].tail):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body

905 

906 

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: the result is a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    ``parser`` defaults to the module's ``html_parser``.  The base URL
    can be overridden with the ``base_url`` keyword, which is most useful
    when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

919 

920 

def _contains_block_level_tag(el):
    """Tell whether any element yielded by ``el.iter(etree.Element)`` has
    a (namespace-stripped) tag listed in ``defs.block_tags``."""
    # FIXME: this could be done with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))

928 

929 

def _element_name(el):
    """Return a short name for ``el``: ``'comment'`` for comment nodes,
    ``'string'`` for plain strings, otherwise the namespace-stripped tag."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, str):
        return 'string'
    return _nons(el.tag)

937 

938 

939################################################################################ 

940# form handling 

941################################################################################ 

942 

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Accessor for all the input elements of this form.

        See `InputGetter` for details about the returned object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view of all the fields in this form.  Assigning
        to an item of the returned object changes the form.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Names that exist in the form but are not mentioned in ``value``
        # are reset to None after the update.
        stale_keys = current.keys()
        for key, new_value in value.items():
            if key in stale_keys:
                stale_keys.remove(key)
            current[key] = new_value
        for key in stale_keys:
            # Unnamed inputs (key None) aren't really expressed in
            # form_values() anyway, so they are left alone.
            if key is not None:
                current[key] = None

    def _name(self):
        """Return a label for this form: its name, else ``#`` + id, else
        its index among the forms found below ``self.body``."""
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        # Fall back to XHTML-namespaced forms when no plain ones exist.
        forms = (list(iter_tags('form'))
                 or list(iter_tags('{%s}form' % XHTML_NAMESPACE)))
        return str(forms.index(self))

    def form_values(self):
        """
        Return the field values of the form as a list of
        ``(name, value)`` tuples.  This is suitable to be passed to
        ``urllib.urlencode()``.
        """
        results = []
        for el in self.inputs:
            name = el.name
            # Unnamed and disabled controls contribute nothing.
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # One pair per selected option.
                    results.extend((name, v) for v in value)
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        On read, the attribute is joined with ``self.base_url`` when both
        are available; otherwise the raw attribute value (possibly None)
        is returned.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        return action

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  The value read is always upper-cased
        and defaults to ``'GET'``.
        """
        return self.get('method', 'GET').upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement

1060 

1061 

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.

    Returns whatever the opener returns; with the default opener that is
    a file-like object, as from ``urllib.urlopen()``, which also has a
    ``.geturl()`` function showing the URL if there were any redirects.

    Example usage::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    pairs; it is appended to the values collected from the form.

    To change the HTTP requester, pass a function as ``open_http``
    keyword argument that opens the URL for you.  The function must have
    the following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is
    the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        extra_pairs = extra_values
        if hasattr(extra_pairs, 'items'):
            extra_pairs = extra_pairs.items()
        values.extend(extra_pairs)
    opener = open_http_urllib if open_http is None else open_http
    # A form without an action submits to the document's base URL.
    target = form.action or form.base_url
    return opener(form.method, target, values)

1098 

1099 

def open_http_urllib(method, url, values):
    """
    Default opener used by `submit_form`, based on ``urllib``.

    For ``'GET'`` the URL-encoded ``values`` are appended to the query
    string of ``url``; for any other method they are sent as the
    ASCII-encoded request body.

    :param method: the request method; only ``'GET'`` is special-cased.
    :param url: the target URL as a string.
    :param values: a sequence of ``(name, value)`` tuples (or a mapping).
    :return: the object returned by ``urllib.request.urlopen()``.
    :raises ValueError: if ``url`` is empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    from urllib.parse import urlencode
    from urllib.request import urlopen

    encoded = urlencode(values)
    if method == 'GET':
        # Extend an existing query string, or start a new one.
        separator = '&' if '?' in url else '?'
        url += separator + encoded
        data = None
    else:
        # urlencode() returns text; urlopen() requires a bytes body.
        data = encoded.encode('ASCII')
    return urlopen(url, data)

1121 

1122 

class FieldsDict(MutableMapping):
    """
    Mapping view over the values of a form's inputs.

    Reading an item returns the ``.value`` of the matching entry of the
    wrapped ``inputs`` object; assigning an item sets that ``.value``.
    Items cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())

    def __len__(self):
        return len(self.inputs)

    def __iter__(self):
        return iter(self.inputs.keys())

    def __contains__(self, item):
        return item in self.inputs

    def keys(self):
        # Hand back exactly what the wrapped object returns.
        return self.inputs.keys()

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

1147 

1148 

class InputGetter:

    """
    An accessor that represents all the input fields in a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing one name are returned together as a list
    (a `CheckboxGroup`, which also allows value setting); radio inputs
    are handled the same way with a `RadioGroup`.  Use ``.keys()`` and
    ``.items()`` to process all fields in this way.

    Iterating over this object yields every input element instead.  This
    is not the same as looking up all the names, because checkboxes and
    radio elements are then returned individually.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        return sum(1 for _ in self)

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def __getitem__(self, name):
        matches = [field for field in self if field.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        # The type of the first match decides how duplicates are grouped.
        input_type = matches[0].get('type')
        if len(matches) > 1:
            if input_type == 'radio':
                group = RadioGroup(matches)
                group.name = name
                return group
            if input_type == 'checkbox':
                group = CheckboxGroup(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return matches[0]

    def keys(self):
        """
        Returns all unique field names, in document order.

        :return: A list of all unique field names.
        """
        # dict.fromkeys() drops duplicates while preserving first-seen
        # order; unnamed inputs (name None) are left out.
        named = (el.name for el in self)
        return list(dict.fromkeys(n for n in named if n is not None))

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        :return: A list of (name, field) tuples.
        """
        unique_names = dict.fromkeys(el.name for el in self)
        return [(name, self[name]) for name in unique_names]

1236 

1237 

class InputMixin:
    """
    Mix-in for all input elements (input, select, and textarea)
    """

    @property
    def name(self):
        """
        Get/set the ``name`` attribute of the element.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # Deleting a name that is not there is silently accepted.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Not every class using this mix-in defines ``type``.
        type_name = getattr(self, 'type', None)
        type_part = ' type=%r' % type_name if type_name else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)

1267 

1268 

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read and written through ``.value``.
    """

    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Child elements are serialised as XML for XHTML-namespaced
        # textareas and as HTML otherwise.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        pieces = [self.text or '']
        pieces.extend(
            etree.tostring(child, method=serialisation_method,
                           encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop any child elements, then store the new text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

1302 

1303 

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is a
    set-like object.  In either case ``.value_options`` gives the
    possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        # The last option carrying ``selected`` wins; without any, fall
        # back to the first option that is not disabled.
        chosen = next(
            (el for el in reversed(options)
             if el.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (el for el in options if el.get('disabled') is None),
                None)
            if chosen is None:
                return None
        result = chosen.get('value')
        if result is None:
            # Options without a value attribute use their stripped text.
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, str):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        target = None
        if value is not None:
            # Find the option to select before touching any attribute,
            # so an unknown value leaves the element unchanged.
            for option in _options_xpath(self):
                option_value = option.get('value')
                if option_value is None:
                    option_value = (option.text or '').strip()
                if option_value == value:
                    target = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or their stripped
        text where that attribute is missing).
        """
        possible = []
        for option in _options_xpath(self):
            option_value = option.get('value')
            if option_value is None:
                option_value = (option.text or '').strip()
            possible.append(option_value)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

1405 

1406 

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.  An option's value is its ``value``
    attribute or, when that is missing, its stripped text.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # Yield the values of the options that carry ``selected``.
        for option in self.options:
            if 'selected' in option.attrib:
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = (option.text or '').strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        :raises ValueError: if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        :raises ValueError: if no option has that value, or if the
            matching option is not currently selected.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Message fixed ("not option" -> "no option") to match add().
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1466 

1467 

class RadioGroup(list):
    """
    A list of several ``<input type=radio>`` elements that share the
    same name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks inputs, and ``.value_options`` lists the possible values.
    """

    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reading returns the ``value`` attribute of the first checked
        input, or None when nothing is checked.
        """
        return next(
            (radio.get('value') for radio in self
             if 'checked' in radio.attrib),
            None)

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            # Locate the radio first so that an unknown value raises
            # before any input has been unchecked.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            else:
                raise ValueError("There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            list.__repr__(self))

1519 

1520 

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not wipe the currently checked boxes.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1563 

1564 

class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the list up front so that the iteration works on a
        # snapshot of the currently checked values.
        checked_values = [
            box.get('value')
            for box in self.group
            if 'checked' in box.attrib]
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox whose value equals ``value``; raise
        KeyError if there is none."""
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """Uncheck the first checkbox whose value equals ``value``; raise
        KeyError if there is none or if it is not checked."""
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.group.name)

1606 

1607 

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``), and the value can be read and written as ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (for all
    other types it is false) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a (non-empty) value reports
        ``'on'``; an unchecked checkbox or radio reports None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truth of ``value`` drives the checked
        # state; only a non-empty string also replaces the attribute.
        self.checked = bool(value)
        if value and isinstance(value, str):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        return self.get('type', 'text').lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; other types
        raise AttributeError.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement

1703 

1704 

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Reading returns None when the label has no (non-empty) ``for``
        attribute; otherwise the lookup is delegated to
        ``self.body.get_element_by_id()``.
        """
        # NOTE(review): what happens when no element has that id depends
        # on get_element_by_id(), which is defined elsewhere -- confirm.
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the ``for`` attribute (see getter/setter);
        # the previous code removed the label's own ``id`` instead.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

1739 

1740 

1741############################################################ 

1742## Serialization 

1743############################################################ 

1744 

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    Accepts either an element or an object with a ``getroot()`` method.
    Tags that already start with ``{`` are left untouched.  The tree is
    modified in place.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        root = html
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        name = node.tag
        if name[0] != '{':
            node.tag = namespace_prefix + name

1758 

1759 

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    Accepts either an element or an object with a ``getroot()`` method.
    The tree is modified in place.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        root = xhtml
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(namespace_prefix)
    for node in root.iter(namespace_prefix + "*"):
        node.tag = node.tag[cut:]

1772 

1773 

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Bound ``sub`` methods of the compiled patterns, one for text and one for
# bytes; tostring() calls them to strip the serialised Content-Type meta tag.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub

1780 

1781 

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: with the 'html' method, a serialised
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    output unless ``include_meta_content_type`` is true.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # The result is text or bytes depending on ``encoding``; use the
    # matching pattern to strip the Content-Type meta tag.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

1857 

1858 

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    :param doc: an element or element tree; elements are wrapped in an
        ``etree.ElementTree`` before being written.
    :param encoding: output encoding; defaults to the document's own
        encoding and finally to ``"UTF-8"``.
    """
    import os
    import pathlib
    import tempfile
    import webbrowser
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # We leak the file itself here, but the ``with`` block makes sure
    # the descriptor is closed even if writing fails.
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    # as_uri() yields a well-formed, percent-encoded file URL on every
    # platform (e.g. ``file:///C:/...`` for Windows drive paths).
    url = pathlib.Path(fn).as_uri()
    print(url)
    webbrowser.open(url)

1880 

1881 

1882################################################################################ 

1883# configure Element class lookup 

1884################################################################################ 

1885 

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Install a fresh lookup instance on this parser.
        element_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(element_lookup)

1893 

1894 

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Install a fresh lookup instance on this parser.
        element_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(element_lookup)

1915 

1916 

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    forwarded to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

1924 

1925 

# Module-level default parser instances.  ``html_parser`` is the fallback
# used by parse() and Element() in this module.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()