Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/lxml/html/__init__.py: 4%

985 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:27 +0000

1# Copyright (c) 2004 Ian Bicking. All rights reserved. 

2# 

3# Redistribution and use in source and binary forms, with or without 

4# modification, are permitted provided that the following conditions are 

5# met: 

6# 

7# 1. Redistributions of source code must retain the above copyright 

8# notice, this list of conditions and the following disclaimer. 

9# 

10# 2. Redistributions in binary form must reproduce the above copyright 

11# notice, this list of conditions and the following disclaimer in 

12# the documentation and/or other materials provided with the 

13# distribution. 

14# 

15# 3. Neither the name of Ian Bicking nor the names of its contributors may 

16# be used to endorse or promote products derived from this software 

17# without specific prior written permission. 

18# 

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 

20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 

21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 

22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 

23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 

24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 

25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 

26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 

27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 

28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 

29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

30 

31"""The ``lxml.html`` tool set for HTML handling. 

32""" 

33 

34from __future__ import absolute_import 

35 

36__all__ = [ 

37 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 

38 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 

39 'find_rel_links', 'find_class', 'make_links_absolute', 

40 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse'] 

41 

42 

43import copy 

44import sys 

45import re 

46from functools import partial 

47 

48try: 

49 from collections.abc import MutableMapping, MutableSet 

50except ImportError: 

51 from collections import MutableMapping, MutableSet 

52 

53from .. import etree 

54from . import defs 

55from ._setmixin import SetMixin 

56 

57try: 

58 from urlparse import urljoin 

59except ImportError: 

60 # Python 3 

61 from urllib.parse import urljoin 

62 

63try: 

64 unicode 

65except NameError: 

66 # Python 3 

67 unicode = str 

68try: 

69 basestring 

70except NameError: 

71 # Python 3 

72 basestring = (str, bytes) 

73 

74 

75def __fix_docstring(s): 

76 if not s: 

77 return s 

78 if sys.version_info[0] >= 3: 

79 sub = re.compile(r"^(\s*)u'", re.M).sub 

80 else: 

81 sub = re.compile(r"^(\s*)b'", re.M).sub 

82 return sub(r"\1'", s) 

83 

84 

# Namespace URI of XHTML elements; the XPath expressions below bind it to
# the prefix "x" so that both plain and XHTML-namespaced tags are matched.
85XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" 

86 

# <a> elements (plain or XHTML) that carry a "rel" attribute.
87_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", 

88 namespaces={'x':XHTML_NAMESPACE}) 

# <option> elements (plain or XHTML).
89_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", 

90 namespaces={'x':XHTML_NAMESPACE}) 

# <form> elements (plain or XHTML).
91_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", 

92 namespaces={'x':XHTML_NAMESPACE}) 

93#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 

# Elements whose space-normalised "class" attribute contains $class_name as a
# complete, space-delimited token.
94_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 

# Elements whose "id" attribute equals $id.
95_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 

# XPath string() value of the context node.
96_collect_string_content = etree.XPath("string()") 

# Iterates over url(...) occurrences (case-insensitive); group 1 is the
# argument, which may still be wrapped in double or single quotes.
97_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer 

# Iterates over @import "..." occurrences; only the double-quoted form is
# matched, and group 1 is the text between the quotes.
98_iter_css_imports = re.compile(r'@import "(.*?)"').finditer 

# <label> elements (plain or XHTML) anywhere in the document whose "for"
# attribute equals $id.
99_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", 

100 namespaces={'x':XHTML_NAMESPACE}) 

# Matches each run of non-space characters.
101_archive_re = re.compile(r'[^ ]+') 

# Searches a meta-refresh "content" value: skips everything up to the first
# ';', then an optional "url=" (case-insensitive), and captures the rest as
# the named group "url".
102_parse_meta_refresh_url = re.compile( 

103 r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search 

104 

105 

106def _unquote_match(s, pos): 

107 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 

108 return s[1:-1], pos+1 

109 else: 

110 return s,pos 

111 

112 

def _transform_result(typ, result):
    """Turn ``result`` back into the kind of value the caller passed in.

    ``typ`` is the type of the original input.  For a ``bytes`` type the
    result is serialised with ``tostring(..., encoding='utf-8')``, for a
    ``unicode`` type with ``tostring(..., encoding='unicode')``; for any
    other type ``result`` is returned as it is.
    """
    # The bytes check comes first, exactly as in the if/elif chain it replaces.
    for string_type, encoding in ((bytes, 'utf-8'), (unicode, 'unicode')):
        if issubclass(typ, string_type):
            return tostring(result, encoding=encoding)
    return result

122 

123 

def _nons(tag):
    """Return ``tag`` without an XHTML namespace qualifier.

    A string tag of the form ``{http://www.w3.org/1999/xhtml}name`` is
    reduced to ``name``.  Tags in any other namespace, tags without a
    namespace and non-string tags are returned unchanged.
    """
    if isinstance(tag, basestring):
        # Compare against the complete "{namespace}" qualifier, closing brace
        # included, so that a different namespace that merely *starts* with
        # the XHTML URI is not mistaken for XHTML.  Slicing instead of
        # indexing also keeps an empty tag from raising IndexError.
        qualifier = '{%s}' % XHTML_NAMESPACE
        if tag[:len(qualifier)] == qualifier:
            return tag.split('}')[-1]
    return tag

129 

130 

class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'

    The collection keeps no state of its own: every operation reads the
    current value of the ``class`` key from the wrapped attribute mapping,
    splits it on whitespace, and writes the joined result back.
    """
    def __init__(self, attributes):
        """Wrap ``attributes``, a mapping supporting ``get``, item
        assignment, item deletion and ``in``.
        """
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Raise ValueError unless ``value`` is a non-empty name without
        whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            # No class names left: drop the attribute instead of leaving an
            # empty one behind.
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # The substring test is a cheap pre-filter before splitting.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Every name is validated the same way as in ``add()``, so an empty
        name or one containing whitespace raises ValueError.  The attribute
        is only written after all names were accepted; a rejected name
        therefore leaves it untouched.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            # Without this check a name such as 'a b' would be written as
            # two classes and could never be found (or removed) again.
            self._check_name(value)
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        try:
            classes.remove(value)
            enabled = False
        except ValueError:
            classes.append(value)
            enabled = True
        if classes:
            self._attributes['class'] = ' '.join(classes)
        else:
            # Only reachable after a removal, so the attribute exists.
            del self._attributes['class']
        return enabled

237 

238 

class HtmlMixin(object):
    """HTML convenience API shared by the ``Html*`` node classes.

    The methods rely on the element API of the class this is mixed into
    (``get``/``set``, ``attrib``, ``xpath``, ``iter``, ``getparent``, ...).
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super(HtmlMixin, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The association is made through the label's ``for`` attribute and
        this element's ``id``.  Reading returns the first matching label, or
        None if this element has no id or no label refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            # The tail follows the last child if there is one; otherwise it
            # goes where the text went.
            if len(self):
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children, in place.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available, and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Reject a bad policy before anything is modified: resolving the
        # <base href> below already removes tags and rewrites links.
        if handle_failures not in (None, 'ignore', 'discard'):
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # Forward the policy; otherwise a bad URL met during base
            # resolution would abort even when the caller asked for
            # 'ignore' or 'discard'.
            self.resolve_base_href(handle_failures=handle_failures)

        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        else:
            def link_repl(href):
                return urljoin(base_url, href)

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base href> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # The link sits inside the element text at offset ``pos``.
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

642 

643 

class _MethodFunc(object):
    """
    Function-style front end for a method of an element class.

    An instance is called with either an element or an HTML string as
    first argument.  A string is parsed with ``fromstring()`` first.  The
    named method is then invoked; if it returns something other than None
    that value is handed back, otherwise (an in-place method) the document
    is returned, converted back to the input's type.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Present the wrapped method's documentation as our own.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        input_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            # An explicit copy=... keyword overrides the default chosen at
            # construction time; either way it is not passed on.
            doc = copy.deepcopy(doc)
        outcome = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if outcome is not None:
            return outcome
        # Then return what we got in
        return _transform_result(input_type, doc)

678 

679 

# Function-style variants of the HtmlMixin methods of the same name.  The
# ones created with copy=True work on a deep copy of an element argument by
# default (see _MethodFunc.__call__); the others use the element passed in.
680find_rel_links = _MethodFunc('find_rel_links', copy=False) 

681find_class = _MethodFunc('find_class', copy=False) 

682make_links_absolute = _MethodFunc('make_links_absolute', copy=True) 

683resolve_base_href = _MethodFunc('resolve_base_href', copy=True) 

684iterlinks = _MethodFunc('iterlinks', copy=False) 

685rewrite_links = _MethodFunc('rewrite_links', copy=True) 

686 

687 

# Comment node class: etree.CommentBase extended with the HtmlMixin API.
# Returned by HtmlElementClassLookup.lookup() for node type 'comment'.
688class HtmlComment(HtmlMixin, etree.CommentBase): 

689 pass 

690 

691 

# Element class: etree.ElementBase extended with the HtmlMixin API.
# HtmlElementClassLookup.lookup() falls back to it for element names that
# have no more specific class registered.
692class HtmlElement(HtmlMixin, etree.ElementBase): 

693 pass 

694 

695 

# Processing-instruction node class: etree.PIBase extended with the
# HtmlMixin API.  Returned by HtmlElementClassLookup.lookup() for node
# type 'PI'.
696class HtmlProcessingInstruction(HtmlMixin, etree.PIBase): 

697 pass 

698 

699 

# Entity node class: etree.EntityBase extended with the HtmlMixin API.
# Returned by HtmlElementClassLookup.lookup() for node type 'entity'.
700class HtmlEntity(HtmlMixin, etree.EntityBase): 

701 pass 

702 

703 

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> element class; copied for each instance that is created
    # without an explicit ``classes`` argument.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The documented form is a mapping, but iterating a mapping
            # yields only its keys, which cannot be unpacked into
            # (name, class).  Use its items when it has any, and keep
            # accepting a plain iterable of pairs.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # '*' applies to every tag that has a class registered.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first in the bases so that they take
                # precedence over the element class.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are resolved by lower-cased tag name, falling back to
        ``HtmlElement``; comments, processing instructions and entities map
        to their ``Html*`` classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

744 

745 

746################################################################################ 

747# parsing 

748################################################################################ 

749 

# Match functions that tell whether input starts (after optional whitespace,
# case-insensitively) with "<html" or "<!doctype".  One is compiled from a
# text pattern and one from a bytes pattern, to be applied to input of the
# matching type.
750_looks_like_full_html_unicode = re.compile( 

751 unicode(r'^\s*<(?:html|!doctype)'), re.I).match 

752_looks_like_full_html_bytes = re.compile( 

753 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match 

754 

755 

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a whole document and return the root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``head`` is inserted as first
    child and a missing ``body`` is appended.

    Raises ``etree.ParserError`` if parsing produced no root element.
    """
    root = etree.fromstring(
        html, html_parser if parser is None else parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

768 

769 

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not already look like a full document is wrapped in
    # <html><body>, using the wrapper that matches the input type.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            opening = '<html><body>'.encode('ascii')
            closing = '</body></html>'.encode('ascii')
            html = opening + html + closing
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html

    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]

    has_leading_text = bool(body.text and body.text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    fragments = [body.text] if has_leading_text else []
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments

807 

808 

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent will hold it.
    fragments = fragments_fromstring(
        html, parser=parser, no_leading_text=not bool(create_parent),
        base_url=base_url, **kw)

    if create_parent:
        # Any true value that is not a tag name means the default <div>.
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        wrapper = Element(parent_tag)
        if fragments:
            if isinstance(fragments[0], basestring):
                wrapper.text = fragments.pop(0)
            wrapper.extend(fragments)
        return wrapper

    if not fragments:
        raise etree.ParserError('No elements found')
    if len(fragments) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in fragments]))
    only = fragments[0]
    if only.tail and only.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % only.tail)
    only.tail = None
    return only

856 

857 

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    looks_full = (_looks_like_full_html_bytes if isinstance(html, bytes)
                  else _looks_like_full_html_unicode)
    is_full_html = looks_full(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there can be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        if other_body.text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + other_body.text
            else:
                body.text = (body.text or '') + other_body.text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc
    if body is None:
        return doc

    def is_blank(text):
        return not text or not text.strip()

    if len(body) == 1 and is_blank(body.text) and is_blank(body[-1].tail):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body

924 

925 

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    The result is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when the input is a file-like object.  When ``parser`` is
    None, the module-level ``html_parser`` is used.  Any additional
    keyword arguments are forwarded to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

938 

939 

def _contains_block_level_tag(el):
    """
    Return True if any element yielded by ``el.iter(etree.Element)``
    has a tag that (after ``_nons()``) is listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))

947 

948 

def _element_name(el):
    """
    Return a short label for ``el``: ``'comment'`` for comment nodes,
    ``'string'`` for plain strings, otherwise the tag passed through
    ``_nons()``.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

956 

957 

958################################################################################ 

959# form handling 

960################################################################################ 

961 

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Accessor object covering every input element inside this form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view of all the fields in this form.  Assigning
        to a key of that view changes the matching form field.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        leftover = current.keys()
        for key, new_value in value.items():
            if key in leftover:
                leftover.remove(key)
            current[key] = new_value
        # Every field that the mapping did not mention gets reset.
        for key in leftover:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            current[key] = None

    def _name(self):
        # Prefer the name attribute, then '#id', then the position of
        # this form among the forms found below ``self.body``.
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of ``(name, value)`` tuples for the fields of the
        form.  This is suitable to be passed to ``urllib.urlencode()``.

        Inputs without a name or with a ``disabled`` attribute are
        skipped, as are unchecked checkable inputs and inputs of type
        submit, image, reset or file.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # One tuple per selected option.
                    for selected in value:
                        results.append((name, selected))
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a base URL and an action, the getter
        returns the two joined with ``urljoin``.
        """
        base_url = self.base_url
        action = self.get('action')
        if action is None or not base_url:
            return action
        return urljoin(base_url, action)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-cased
        string, and defaults to ``'GET'``
        """
        declared = self.get('method', 'GET')
        return declared.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement

1079 

1080 

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  With the default opener this
    returns a file-like object, as from ``urllib.urlopen()``.  That
    object also has a ``.geturl()`` function, which shows the URL if
    there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping (anything with ``.items()``) or a
    sequence of ``(name, value)`` tuples; it is appended to the values
    collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string (the form's action, or its base URL when there
    is no action), and the values are a sequence of ``(name, value)``
    tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            pairs = extra_values.items()
        else:
            pairs = extra_values
        values.extend(pairs)
    opener = open_http_urllib if open_http is None else open_http
    # Fall back to the document's own URL when the form has no action.
    url = form.action or form.base_url
    return opener(form.method, url, values)

1117 

1118 

def open_http_urllib(method, url, values):
    """
    Default opener used by ``submit_form()``, built on urllib.

    For ``'GET'`` the url-encoded values are appended to the URL as a
    query string; for any other method they are sent as the request
    body.  Raises ValueError when ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        # Extend an existing query string, or start a new one.
        separator = '&' if '?' in url else '?'
        url = url + separator + encoded
        data = None
    elif isinstance(encoded, bytes):
        data = encoded
    else:
        data = encoded.encode('ASCII')
    return urlopen(url, data)

1140 

1141 

class FieldsDict(MutableMapping):
    """
    Mapping view over the values of a set of inputs.

    ``fields[name]`` reads ``inputs[name].value`` and assigning to it
    sets that ``value``.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Removal is refused for every key.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        names = self.inputs.keys()
        return iter(names)

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, form_name)

1166 

1167 

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    When several checkboxes share that name, the result is a
    `CheckboxGroup` (a list that also allows value setting); several
    radio inputs with the same name yield a `RadioGroup`.  Use
    ``.keys()`` and ``.items()`` to process all fields in this way.

    Iterating over this object yields every select, input and textarea
    element of the form individually, so checkboxes and radio elements
    are not grouped in that case.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        form_name = self.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, form_name)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [field for field in self if field.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        first = matches[0]
        input_type = first.get('type')
        several = len(matches) > 1
        if several and input_type == 'radio':
            group = RadioGroup(matches)
        elif several and input_type == 'checkbox':
            group = CheckboxGroup(matches)
        else:
            # I don't like throwing away elements like this
            return first
        group.name = name
        return group

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def keys(self):
        """
        Returns all unique field names, in document order.

        Fields without a name are left out.

        :return: A list of all unique field names.
        """
        ordered = []
        known = {None}
        for field in self:
            field_name = field.name
            if field_name in known:
                continue
            known.add(field_name)
            ordered.append(field_name)
        return ordered

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        Each name appears once, paired with ``self[name]``.

        :return: A list of (name, field) tuples.
        """
        pairs = []
        known = set()
        for field in self:
            field_name = field.name
            if field_name in known:
                continue
            known.add(field_name)
            pairs.append((field_name, self[field_name]))
        return pairs

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count

1255 

1256 

class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # Deleting a name that is not set is a no-op.
        if 'name' in self.attrib:
            del self.attrib['name']

    def __repr__(self):
        # Not every class using this mix-in has a ``type`` attribute.
        type_name = getattr(self, 'type', None)
        suffix = ' type=%r' % type_name if type_name else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)

1286 

1287 

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)

        Child elements, if any, are serialised and appended to the
        leading text.
        """
        is_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if is_xhtml else 'html'
        pieces = [self.text or '']
        # it's rare that a textarea actually has child elements
        for child in self:
            pieces.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop any child elements, then store the new text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

1321 

1322 

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select, the last option carrying a ``selected``
        attribute wins; without one, the first option that is not
        ``disabled`` is used, and None is returned if there is none.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = None
        for option in reversed(options):
            if option.get('selected') is not None:
                chosen = option
                break
        if chosen is None:
            for option in options:
                if option.get('disabled') is None:
                    chosen = option
                    break
        if chosen is None:
            return None
        value = chosen.get('value')
        if value is None:
            # No value attribute: fall back to the option's text.
            value = (chosen.text or '').strip()
        return value

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            values = self.value
            values.clear()
            values.update(value)
            return
        options = _options_xpath(self)
        checked_option = None
        if value is not None:
            for el in options:
                opt_value = el.get('value')
                if opt_value is None:
                    opt_value = (el.text or '').strip()
                if opt_value == value:
                    checked_option = el
                    break
            if checked_option is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then mark the requested option (if any).
        for el in options:
            if 'selected' in el.attrib:
                del el.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        text of an option that has no ``value`` attribute).
        """
        possible = []
        for option in _options_xpath(self):
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            possible.append(opt_value)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

1424 

1425 

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _option_value(option):
        # The value attribute, or the stripped text when it is missing.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = (option.text or '').strip()
        return opt_value

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``; raise
        ValueError if there is no such option.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``; raise
        ValueError if it is not selected or does not exist.
        """
        for option in self.options:
            if self._option_value(option) != item:
                continue
            if 'selected' not in option.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del option.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        selected = ', '.join([repr(v) for v in self])
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__, selected, self.select.name)

1485 

1486 

class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reading returns the ``value`` attribute of the first checked
        radio, or None when nothing is checked.
        """
        checked = next((el for el in self if 'checked' in el.attrib), None)
        if checked is None:
            return None
        return checked.get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            # Find the radio to check before touching anything.
            for el in self:
                if el.get('value') == value:
                    target = el
                    break
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for el in self:
            options.append(el.get('value'))
        return options

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)

1538 

1539 

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection.
        Assigning something that is not iterable raises ValueError and
        leaves the checkboxes as they were.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before mutating: rejecting the argument must not wipe
        # the currently checked boxes as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1582 

1583 

class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Collect into a list first so the iterator works on a snapshot.
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose value equals ``value``; raise
        KeyError if the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose value equals ``value``; raise
        KeyError if it is already unchecked or does not exist.
        """
        for el in self.group:
            if el.get('value') != value:
                continue
            if 'checked' not in el.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del el.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        checked = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, checked, self.group.name)

1625 

1626 

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``;
        an unchecked one reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Only string values are written to the attribute; any other
        # true value merely checks the input.
        if isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased and defaulting to ``'text'``.
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; on any other
        type, AttributeError is raised.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement

1722 

1723 

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Reading returns None if the label has no ``for`` attribute;
        otherwise the lookup is delegated to
        ``self.body.get_element_by_id()``.  Assigning an element stores
        its ``id`` in this label's ``for`` attribute and raises
        TypeError if that element has no id.  Deleting removes the
        ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link is stored in this label's ``for`` attribute (see the
        # getter and setter), so that is what has to be removed -- not
        # the label's own ``id``.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

1758 

1759 

1760############################################################ 

1761## Serialization 

1762############################################################ 

1763 

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be an element or anything with a ``getroot()`` method.
    Tags that already start with ``{`` are left alone.  The tree is
    changed in place.
    """
    root = html
    try:
        root = html.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        pass
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        plain_tag = node.tag
        if plain_tag[0] != '{':
            node.tag = namespace_prefix + plain_tag

1777 

1778 

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be an element or anything with a ``getroot()``
    method.  The tree is changed in place.
    """
    root = xhtml
    try:
        root = xhtml.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        pass
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    skip = len(namespace_prefix)
    for node in root.iter(namespace_prefix + "*"):
        node.tag = node.tag[skip:]

1791 

1792 

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type meta tag with the regex that matches the
    # type (text or bytes) of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

1876 

1877 

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    The file is written with ``encoding`` if given, otherwise with the
    document's own encoding, falling back to UTF-8.  The file URL is
    also printed.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    output_encoding = encoding or doc.docinfo.encoding or "UTF-8"
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=output_encoding)
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1899 

1900 

1901################################################################################ 

1902# configure Element class lookup 

1903################################################################################ 

1904 

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the lookup that maps tags to lxml.html element classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1912 

1913 

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Install the lookup that maps tags to lxml.html element classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1934 

1935 

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    handed to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

1943 

1944 

# Module-level parser instances.  ``html_parser`` is the default used by
# ``parse()`` and ``fromstring()`` when no parser is passed, and by
# ``Element()`` to create new elements.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()