Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/lxml/html/__init__.py: 4%
985 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:27 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:27 +0000
1# Copyright (c) 2004 Ian Bicking. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are
5# met:
6#
7# 1. Redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer.
9#
10# 2. Redistributions in binary form must reproduce the above copyright
11# notice, this list of conditions and the following disclaimer in
12# the documentation and/or other materials provided with the
13# distribution.
14#
15# 3. Neither the name of Ian Bicking nor the names of its contributors may
16# be used to endorse or promote products derived from this software
17# without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31"""The ``lxml.html`` tool set for HTML handling.
32"""
34from __future__ import absolute_import
# Public names exported by ``from lxml.html import *``.
__all__ = [
    'document_fromstring',
    'fragment_fromstring',
    'fragments_fromstring',
    'fromstring',
    'tostring',
    'Element',
    'defs',
    'open_in_browser',
    'submit_form',
    'find_rel_links',
    'find_class',
    'make_links_absolute',
    'resolve_base_href',
    'iterlinks',
    'rewrite_links',
    'parse',
]
43import copy
44import sys
45import re
46from functools import partial
48try:
49 from collections.abc import MutableMapping, MutableSet
50except ImportError:
51 from collections import MutableMapping, MutableSet
53from .. import etree
54from . import defs
55from ._setmixin import SetMixin
57try:
58 from urlparse import urljoin
59except ImportError:
60 # Python 3
61 from urllib.parse import urljoin
# Python 2/3 compatibility aliases.  On Python 3 the Python 2 builtins
# ``unicode`` and ``basestring`` do not exist, so looking them up raises
# NameError and we bind replacements at module level instead.
try:
    unicode
except NameError:
    # Python 3: text strings are ``str``
    unicode = str

try:
    basestring
except NameError:
    # Python 3: "any string" means text or bytes
    basestring = (str, bytes)
def __fix_docstring(s):
    """Strip the string-literal prefix from doctest output lines in *s*.

    On Python 3 a leading ``u'`` (after optional whitespace at the start
    of a line) is rewritten to ``'``; on Python 2 the same is done for
    ``b'``.  Falsy input (``None`` or the empty string) is returned as is.
    """
    if not s:
        return s
    # Only the prefix letter differs between the two interpreter lines.
    prefix = 'u' if sys.version_info[0] >= 3 else 'b'
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)
# Namespace URI used by XHTML documents.  The XPath expressions below match
# both plain (un-namespaced) HTML tags and their XHTML-namespaced twins.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath helpers (compiled once at import time).
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose whitespace-separated class list contains $class_name.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# CSS link scanners: ``url(...)`` references (optionally quoted) and
# double-quoted ``@import "..."`` rules.
_iter_css_urls = re.compile(
    r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer

_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces={'x':XHTML_NAMESPACE})

# One space-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
# Extracts the URL part of a <meta http-equiv="refresh" content="..."> value.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from *s* and adjust its position.

    :param s: the matched text, possibly wrapped in ``"..."`` or ``'...'``.
    :param pos: the offset at which *s* starts in the surrounding text.
    :returns: ``(unquoted_text, pos + 1)`` when *s* is enclosed in a pair of
        identical quote characters, otherwise ``(s, pos)`` unchanged.

    A string consisting of a single quote character is *not* treated as a
    quoted (empty) string: one character cannot be both the opening and the
    closing delimiter, and reporting ``pos + 1`` for it would point past the
    actual text.
    """
    if len(s) >= 2 and s[:1] == s[-1:] and s[:1] in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
def _transform_result(typ, result):
    """Convert *result* back into the kind of value the caller passed in.

    If *typ* is a bytes type the result is serialised as UTF-8 bytes, if it
    is a text type it is serialised as a unicode string; for any other type
    the result is returned untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
def _nons(tag):
    """Return *tag* without its XHTML namespace prefix.

    Only string tags of the form ``{http://www.w3.org/1999/xhtml}name`` are
    shortened to ``name``; every other value (including non-string tags) is
    returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        # *attributes* is the attribute mapping that is read and written;
        # the current value of 'class' is re-read on every operation.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Raise ValueError unless *value* is a non-empty, whitespace-free name."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        """
        self._check_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        """
        self._check_name(value)
        remaining = []
        for name in self._get_class_value().split():
            if name != value:
                remaining.append(name)
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            # nothing left: drop the attribute instead of leaving class=""
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        value = self._get_class_value()
        # cheap substring test first, exact token test second
        if name not in value:
            return False
        return name in value.split()

    def __iter__(self):
        names = self._get_class_value().split()
        return iter(names)

    def __len__(self):
        names = self._get_class_value().split()
        return len(names)

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        known = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # only write the attribute back if something was actually added
        if len(names) != known:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._check_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
class HtmlMixin(object):
    """Mixin providing the HTML-specific element API.

    It is combined with the ``etree`` base classes to build
    ``HtmlElement``, ``HtmlComment``, ``HtmlProcessingInstruction`` and
    ``HtmlEntity``, and therefore relies on the regular element API
    (``get``, ``set``, ``attrib``, ``xpath``, ``iter``, ...) being
    provided by the class it is mixed into.
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super(HtmlMixin, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body> element.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head> element.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The getter returns None if this element has no ``id`` or if no
        label refers to it.  The setter raises TypeError if this element
        has no ``id`` or if the assigned element is not a <label>.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # the tail follows our last child once the tag is gone
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # replace this element by its children, in place
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Pick (and thereby validate) the failure policy *before* touching
        # the tree, so that an invalid argument cannot leave the document
        # half-modified (e.g. with its <base> tags already removed).
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # honour the caller's failure policy for <base href> as well
            self.resolve_base_href(handle_failures=handle_failures)

        # <base href> has either just been resolved or must be ignored on
        # request; rewrite_links() would otherwise resolve it (again) because
        # its own ``resolve_base_href`` argument defaults to true.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # every <base href> tag is removed; the href of the last one wins
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # space-separated list of URLs, each reported on its own
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # link inside the element text: splice it in at its position
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
class _MethodFunc(object):
    """
    Callable that exposes an element method as a module-level function.

    The function accepts either an element or an HTML string as its first
    argument.  It returns whatever the method returns; if the method works
    in-place (and so returns None), the processed document is returned in
    the form of the input (serialised again for string input).
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # present the wrapped method's documentation as our own
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            # string input is always parsed into a fresh tree
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # per-call 'copy' keyword overrides the default of this wrapper
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
# Function-style versions of the HtmlMixin link helpers.

# Query helpers: never copy an element argument by default.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
iterlinks = _MethodFunc('iterlinks', copy=False)

# Modifying helpers: work on a deep copy of an element argument by default.
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment node class that also provides the HtmlMixin API."""
class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class that also provides the HtmlMixin API."""
class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing instruction class that also provides the HtmlMixin API."""
class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity class that also provides the HtmlMixin API."""
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag_name, mixin_class)`` pairs; the latter allows several mixins
    for the same tag name.
    """
    # tag name -> Element class; filled in at module level by the code
    # that defines the specialised element classes
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The docstring promises a mapping, but iterating a mapping
            # directly only yields its keys, which cannot be unpacked into
            # (name, class) pairs.  Use its items; still accept the plain
            # sequence of pairs that older callers pass.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # '*' applies to every tag that has a registered class
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # mixins come first so that they can override the base class
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class for the given node, or None for the normal lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
746################################################################################
747# parsing
748################################################################################
# Matchers that recognise input starting (after optional whitespace) with an
# <html> tag or a doctype declaration, i.e. a complete document rather than
# a fragment.  One variant each for text and for byte strings.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'),
    re.I,
).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'),
    re.I,
).match
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse *html* as a complete document and return its root element.

    Uses the module's default ``html_parser`` unless *parser* is given;
    extra keyword arguments are passed on to ``etree.fromstring()``.
    If *ensure_head_body* is true, a missing ``<head>`` and/or ``<body>``
    child is added to the root element.

    Raises ``etree.ParserError`` if parsing yields no document.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that is not a full document is wrapped in an artificial one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html

    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]

    # whitespace-only leading text does not count as leading text
    has_leading_text = bool(body.text and body.text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)

    elements = [body.text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # leading text is only acceptable if a wrapper element will hold it
    wrap_in_parent = bool(create_parent)
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap_in_parent,
        base_url=base_url, **kw)

    if create_parent:
        # any true non-string value means "use the default wrapper tag"
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))

    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    if isinstance(html, bytes):
        looks_like_full_html = _looks_like_full_html_bytes
    else:
        looks_like_full_html = _looks_like_full_html_unicode
    is_full_html = looks_like_full_html(html)

    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        if other_body.text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + other_body.text
            else:
                body.text = (body.text or '') + other_body.text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc

    if body is None:
        return doc

    if len(body) == 1:
        no_leading_text = not body.text or not body.text.strip()
        no_trailing_text = not body[-1].tail or not body[-1].tail.strip()
        if no_leading_text and no_trailing_text:
            # The body has just one element, so it was probably a single
            # element passed in
            return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    # fall back to the module's default HTML parser
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
def _contains_block_level_tag(el):
    """Return True if *el* or any element below it is a block-level tag."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element)
    )
def _element_name(el):
    """Return a short display name for a parsed fragment item.

    Gives ``'comment'`` for comment nodes, ``'string'`` for text items and
    the tag name (without XHTML namespace) for everything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
958################################################################################
959# form handling
960################################################################################
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        An accessor object covering all the input elements of this form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view of all the fields in this form.  Assigning
        to a key of this view changes the form.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Names that are not mentioned in ``value`` are reset to None below.
        leftover = current.keys()
        for name, new_value in value.items():
            if name in leftover:
                leftover.remove(name)
            current[name] = new_value
        for name in leftover:
            if name is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            current[name] = None

    def _name(self):
        # Prefer the name attribute, then '#' + id, then the position of
        # this form among the forms found below the document body.
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        find = self.body.iter
        candidates = (
            list(find('form'))
            or list(find('{%s}form' % XHTML_NAMESPACE)))
        return str(candidates.index(self))

    def form_values(self):
        """
        Collect the field values of this form as a list of
        ``(name, value)`` tuples, suitable to be passed to
        ``urllib.urlencode()``.

        Unnamed and disabled fields are left out, as are unchecked
        checkboxes/radios and inputs of type submit, image, reset or file.
        """
        pairs = []
        for field in self.inputs:
            name = field.name
            if not name or 'disabled' in field.attrib:
                continue
            tag = _nons(field.tag)
            if tag == 'textarea':
                pairs.append((name, field.value))
                continue
            if tag == 'select':
                selected = field.value
                if field.multiple:
                    # One tuple per selected option.
                    pairs.extend((name, item) for item in selected)
                elif selected is not None:
                    pairs.append((name, selected))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % field)
            if field.checkable and not field.checked:
                continue
            if field.type in ('submit', 'image', 'reset', 'file'):
                continue
            current = field.value
            if current is not None:
                pairs.append((name, current))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.  On reading, the value is
        joined with the element's base URL when both are available.
        """
        base = self.base_url
        target = self.get('action')
        if target is None or not base:
            return target
        return urljoin(base, target)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  The result is always upper-cased and
        falls back to ``'GET'`` when the attribute is missing.
        """
        declared = self.get('method', 'GET')
        return declared.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  By default the result is a
    file-like object, as from ``urllib.urlopen()``.  That object also has
    a ``.geturl()`` function, which shows the URL if there were any
    redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; it is appended to the values collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.  Whatever ``open_http``
    returns is returned from this function.
    """
    payload = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extras = extra_values.items()
        else:
            extras = extra_values
        payload.extend(extras)
    opener = open_http_urllib if open_http is None else open_http
    # Without an action, submit to the document's own URL.
    target = form.action or form.base_url
    return opener(form.method, target, payload)
def open_http_urllib(method, url, values):
    """
    Default requester used by ``submit_form()``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns whatever ``urlopen()`` returns.

    Raises ``ValueError`` when ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    # The request body has to be bytes.
    body = encoded if isinstance(encoded, bytes) else encoded.encode('ASCII')
    return urlopen(url, body)
class FieldsDict(MutableMapping):
    """
    Dictionary-like view on the values of a form's fields.

    Wraps an inputs accessor (such as `InputGetter`): reading a key
    returns the ``.value`` of the matching field, assigning a key sets
    that ``.value``.  Keys cannot be removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # Fields belong to the form markup; removing them through this
        # view is not supported.  The message names the actual class.
        raise KeyError(
            "You cannot remove keys from %s" % self.__class__.__name__)

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
class InputGetter(object):

    """
    Accessor for all the input fields of a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing one name come back as a list (a
    `CheckboxGroup`, which also allows value setting); radio inputs are
    handled the same way.  Use ``.keys()`` and ``.items()`` to process
    all fields in this way.

    Iterating over this object yields every input element instead.  That
    is not the same as going through the names, because checkboxes and
    radio elements are returned individually.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [el for el in self if el.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        first = matches[0]
        kind = first.get('type')
        if len(matches) > 1:
            if kind == 'radio':
                group = RadioGroup(matches)
            elif kind == 'checkbox':
                group = CheckboxGroup(matches)
            else:
                # I don't like throwing away elements like this
                return first
            group.name = name
            return group
        return first

    def __contains__(self, name):
        return any(el.name == name for el in self)

    def keys(self):
        """
        Returns all unique field names, in document order.

        :return: A list of all unique field names.
        """
        # Seeding with None keeps unnamed fields out of the result.
        seen = {None}
        ordered = []
        for name in (el.name for el in self):
            if name in seen:
                continue
            seen.add(name)
            ordered.append(name)
        return ordered

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        :return: A list of (name, field) tuples.
        """
        seen = set()
        pairs = []
        for name in (el.name for el in self):
            if name in seen:
                continue
            seen.add(name)
            pairs.append((name, self[name]))
        return pairs

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count
class InputMixin(object):
    """
    Mix-in shared by all input elements (input, select, and textarea).
    """
    @property
    def name(self):
        """
        Get/set the ``name`` attribute of the element.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only classes that define a (non-empty) ``type`` show it.
        kind = getattr(self, 'type', None)
        suffix = (' type=%r' % kind) if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name``, the
    value can be read and written through ``.value``.
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        # Leading text first, then any child elements serialised as markup.
        pieces = [self.text or '']
        pieces.extend(
            etree.tostring(child, method=method, encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop all child elements, then store the new text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is a
    set-like object.  Either way ``.value_options`` gives the possible
    values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        # The last option marked as selected wins; without any, fall back
        # to the first option that is not disabled.
        chosen = next(
            (opt for opt in reversed(options)
             if opt.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (opt for opt in options if opt.get('disabled') is None),
                None)
        if chosen is None:
            return None
        result = chosen.get('value')
        if result is None:
            # No value attribute: the option text is the value.
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        target = None
        if value is not None:
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = (option.text or '').strip()
                if candidate == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then select the requested option (if any).
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped option
        text where that attribute is missing).
        """
        possible = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = (option.text or '').strip()
            possible.append(candidate)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    @staticmethod
    def _option_value(option):
        # The value of an option is its ``value`` attribute or, when that
        # is missing, its stripped text.
        value = option.get('value')
        if value is None:
            value = (option.text or '').strip()
        return value

    def __iter__(self):
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ``ValueError`` if there is no such option.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ``ValueError`` if there is no such option, or if that
        option is not currently selected.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the list behaviour, the ``.value`` property checks/unchecks
    inputs and ``.value_options`` lists the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        return next(
            (radio.get('value') for radio in self
             if 'checked' in radio.attrib),
            None)

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            target = next(
                (radio for radio in self if radio.get('value') == value),
                None)
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        # Uncheck everything first; at most one radio ends up checked.
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection.
        Assigning something that is not iterable raises ``ValueError``
        and leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before touching any checkbox, so that a rejected
        # assignment does not wipe the current selection.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the list up front: the iterator is a snapshot and is not
        # affected by boxes being (un)checked while it is consumed.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox with this value; KeyError if none."""
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox with this value.  Raises KeyError if
        there is none or if it was not checked.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(item) for item in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``).

    The value can be read and written through ``.value``.

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``; one
        that is not checked reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truth of ``value`` drives the checked
        # state; only a (non-empty) string is also stored as the value.
        if value:
            self.checked = True
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Reading returns None when the label has no (or an empty) ``for``
        attribute; otherwise the element is looked up by id below the
        document body.  Assigning an element stores its ``id`` in the
        ``for`` attribute and raises ``TypeError`` if the element has no
        id.  Deleting removes the ``for`` attribute.
        """
        # NOTE(review): what happens when no element carries that id is
        # decided by get_element_by_id(), which is not defined here.
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link to the target lives in ``for``; the label's own ``id``
        # attribute must not be touched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1760############################################################
1761## Serialization
1762############################################################
def html_to_xhtml(html):
    """Turn an HTML tree into XHTML in place by moving every tag that has
    no namespace yet into the XHTML namespace.

    Accepts an element or a tree (anything with ``getroot()``).
    """
    try:
        root = html.getroot()
    except AttributeError:
        # Not a tree, so treat the argument as the root element itself.
        root = html
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        name = node.tag
        if name[0] == '{':
            # Already namespaced, leave it alone.
            continue
        node.tag = namespace_prefix + name
def xhtml_to_html(xhtml):
    """Turn an XHTML tree into HTML in place by stripping the XHTML
    namespace from every tag that carries it.

    Accepts an element or a tree (anything with ``getroot()``).
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # Not a tree, so treat the argument as the root element itself.
        root = xhtml
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(namespace_prefix)
    for node in root.iter(namespace_prefix + "*"):
        node.tag = node.tag[cut:]
# Substitution helpers (text and bytes flavour) for the Content-Type
# <meta> tag.  This isn't a general match, but it's a match for what
# libxml2 specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>',
).sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>',
).sub
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument selects the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Pass the name ``'unicode'`` as ``encoding`` to serialise
    to a Unicode string.

    The ``method`` argument selects the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    Pass ``with_tail=False`` to leave out the tail text of the top-level
    element that is being serialised.

    The ``doctype`` option takes a plain string that will be serialised
    before the XML tree.  Note that passing in non well-formed content
    here will make the XML output non well-formed.  Also, an existing
    doctype in the document tree will not be removed when serialising an
    ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail,
        doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type <meta> tag, using the helper that matches
    # the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary file and open that file in a
    web browser.  The file is not deleted afterwards.  This is mainly
    meant for debugging.

    ``encoding`` defaults to the encoding recorded for the document, or
    "UTF-8" if there is none.
    """
    import os
    import webbrowser
    import tempfile
    tree = doc if isinstance(doc, etree._ElementTree) else etree.ElementTree(doc)
    handle, path = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        tree.write(
            out, method="html",
            encoding=encoding or tree.docinfo.encoding or "UTF-8")
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1901################################################################################
1902# configure Element class lookup
1903################################################################################
class HTMLParser(etree.HTMLParser):
    """An HTML parser set up to produce lxml.html Element objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
class XHTMLParser(etree.XMLParser):
    """An XML parser set up to produce lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.

    All keyword arguments are passed on to ``etree.XMLParser``.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
def Element(*args, **kw):
    """Create a new HTML Element through the default ``html_parser``.

    This can also be used for XHTML documents.  All arguments are passed
    on to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)
# Module-level parser instances.  ``html_parser`` is the default parser of
# ``parse()`` and the factory behind ``Element()``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()