1# Copyright (c) 2004 Ian Bicking. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are
5# met:
6#
7# 1. Redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer.
9#
10# 2. Redistributions in binary form must reproduce the above copyright
11# notice, this list of conditions and the following disclaimer in
12# the documentation and/or other materials provided with the
13# distribution.
14#
15# 3. Neither the name of Ian Bicking nor the names of its contributors may
16# be used to endorse or promote products derived from this software
17# without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""The ``lxml.html`` tool set for HTML handling.
32"""
33
34
# Names exported by a star-import of this module.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
40
41
42import copy
43import re
44
45from collections.abc import MutableMapping, MutableSet
46from functools import partial
47from urllib.parse import urljoin
48
49from .. import etree
50from . import defs
51from ._setmixin import SetMixin
52
53
54def __fix_docstring(s):
55 # TODO: remove and clean up doctests
56 if not s:
57 return s
58 sub = re.compile(r"^(\s*)u'", re.M).sub
59 return sub(r"\1'", s)
60
61
# Namespace URI of XHTML elements.  The pre-compiled XPath expressions below
# bind it to the prefix ``x`` so that they match both plain HTML tags and
# their namespaced XHTML counterparts.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements (including the context node itself) that carry a ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
# Alternative implementation based on EXSLT regular expressions (not in use):
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-normalised ``class`` attribute contains
# ``$class_name`` as a complete, space-delimited token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over CSS ``url(...)`` references (case-insensitive); group 1 is the
# argument, including surrounding double or single quotes if there are any.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over CSS ``@import "..."`` statements; only the double-quoted form
# is matched, group 1 is the text between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements anywhere in the document whose ``for`` attribute equals ``$id``.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters, i.e. one entry of a
# space-separated ``archive`` attribute.
_archive_re = re.compile(r'[^ ]+')
# Searches the ``content`` value of a meta refresh tag: everything after the
# first ';' (minus an optional ``url=`` prefix) is captured as group ``url``.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
81
82
83def _unquote_match(s, pos):
84 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
85 return s[1:-1], pos+1
86 else:
87 return s,pos
88
89
90def _transform_result(typ, result):
91 """Convert the result back into the input type.
92 """
93 if issubclass(typ, bytes):
94 return tostring(result, encoding='utf-8')
95 elif issubclass(typ, str):
96 return tostring(result, encoding='unicode')
97 else:
98 return result
99
100
101def _nons(tag):
102 if isinstance(tag, str):
103 if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
104 return tag.split('}')[-1]
105 return tag
106
107
class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        # ``attributes`` is the mapping that holds the 'class' attribute;
        # every operation reads from and writes to it directly.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        # A class name must be non-empty and free of whitespace.
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        Nothing changes if the class is already present.  Raises ValueError
        for an empty name or a name that contains whitespace.
        """
        self._check_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        A missing class is not an error.  When no class is left, the
        'class' attribute itself is deleted.
        """
        self._check_name(value)
        remaining = [
            name for name in self._get_class_value().split() if name != value]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        Raises KeyError if the class is not present.
        """
        self._check_name(value)
        super().remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        if name not in raw:
            # cheap substring test before splitting the attribute value
            return False
        return name in raw.split()

    def __iter__(self):
        names = self._get_class_value().split()
        return iter(names)

    def __len__(self):
        names = self._get_class_value().split()
        return len(names)

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        changed = False
        for candidate in values:
            if candidate in names:
                continue
            names.append(candidate)
            changed = True
        if changed:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._check_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
214
215
class HtmlMixin:
    """Mixin with the HTML-specific API of the element classes in this module.

    It relies on the ElementTree-style interface (``get``, ``set``,
    ``attrib``, ``xpath``, ``iter``, ``getparent`` ...) of the class it is
    combined with.
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super().set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urllib.parse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The getter returns the first <label> in the document whose ``for``
        attribute equals this element's ``id``, or None if the element has
        no id or no such label exists.  The setter points the given label
        at this element; deleting removes the ``for`` attribute of the
        associated label.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, str):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # the tail follows the last child once the tag is gone
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # replace this element by its children, in place
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison with the ``rel`` attribute is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for an
        unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Build (and thereby validate) the replacement function before the
        # document is touched, so that a bad ``handle_failures`` value cannot
        # leave a partially processed tree behind.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            self.resolve_base_href(handle_failures=handle_failures)

        # <base href> has either been applied above or is to be ignored, so
        # rewrite_links() must not resolve it (again) on its own.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # every <base> tag is removed; the href of the last one wins
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # link inside the element text: splice the new link in
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
619
620
class _MethodFunc:
    """
    Wraps an element method so that it can be used as a plain function.

    The function accepts either an element or an HTML string (``str`` or
    ``bytes``) as its first argument.  A string is parsed first.  The
    return value is whatever the method returns; if the method works
    in-place (and so returns None), the processed document is returned
    instead, serialized back to the input string type for string input.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # present the wrapped method's documentation as our own
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        got_markup = isinstance(doc, (str, bytes))
        if got_markup and 'copy' in kw:
            raise TypeError(
                "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
        if got_markup:
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            # leave the caller's tree untouched
            doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        # None means "worked in place": then return what we got in
        return _transform_result(result_type, doc) if result is None else result
655
656
# Function versions of the HtmlMixin methods of the same name.  Each takes an
# element or an HTML string as first argument.  The ones created with
# ``copy=True`` modify the document and therefore work on a deep copy of an
# element input by default.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
663
664
class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment class that carries the ``HtmlMixin`` API."""
    pass
667
668
class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class that carries the ``HtmlMixin`` API."""
    pass
671
672
class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing instruction class that carries the ``HtmlMixin`` API."""
    pass
675
676
class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity class that carries the ``HtmlMixin`` API."""
    pass
679
680
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    ``mixins`` may be a mapping or an iterable of ``(tag name, Mixin class)``
    pairs.  The special key '*' denotes a Mixin class that should be mixed
    into every Element class registered in ``classes``.
    """
    # tag name -> Element class; used when no ``classes`` mapping is passed
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would only yield its keys, so use
            # its items; a plain iterable of pairs is consumed as it is.
            mixin_items = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in mixin_items:
                if name == '*':
                    for n in classes:
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                # derive a new class with the mixins in front of the
                # currently registered (or default) Element class
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are looked up by their lower-cased tag name, falling back
        to ``HtmlElement``; comments, processing instructions and entities
        get their respective Html* class.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
721
722
723################################################################################
724# parsing
725################################################################################
726
# Matchers that tell whether markup starts (after optional whitespace) with an
# ``<html`` tag or a ``<!doctype`` declaration, case-insensitively.  There is
# one for ``str`` input and one for ``bytes`` input.
_looks_like_full_html_unicode = re.compile(
    r'^\s*<(?:html|!doctype)', re.I).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', re.I).match
731
732
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; further keyword
    arguments are handed to ``etree.fromstring()``.  If ``ensure_head_body``
    is true, a missing ``head`` is inserted as first child of the root and a
    missing ``body`` is appended to it.

    Raises ``etree.ParserError`` if parsing yields no document.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
745
746
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (leading text that is not
    just whitespace).  If no_leading_text is true, such leading text raises
    ``etree.ParserError`` instead, and the result is always a list of only
    elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Wrap bare fragments in an artificial document so that the parser
    # puts all of them into one <body>.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    has_leading_text = bool(leading_text and leading_text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = [leading_text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
784
785
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element ('div'
    unless a tag name is given).  In this case, leading or trailing text
    is also allowed, as are multiple elements as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).

    Raises ``etree.ParserError`` when the single-element rules are violated.
    """
    if parser is None:
        parser = html_parser

    # leading text is only acceptable when it can go into a created parent
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, str) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], str):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        found = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % found)
    el = elements[0]
    trailing = el.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el
833
834
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  Input that starts like a full document,
    or that produces a <head>, comes back as the document root.  A lone
    element in the body is returned on its own.  Anything else is returned
    in the body element renamed to 'div' or 'span'.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    looks_like_full_html = (
        _looks_like_full_html_bytes if isinstance(html, bytes)
        else _looks_like_full_html_unicode)
    is_full_html = looks_like_full_html(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    body = None
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        # Somehow there may be multiple bodies, which is bad, but just
        # smash them into one body
        for other_body in bodies[1:]:
            extra_text = other_body.text
            if extra_text:
                if len(body):
                    last = body[-1]
                    last.tail = (last.tail or '') + extra_text
                else:
                    body.text = (body.text or '') + extra_text
            body.extend(other_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc
    if body is None:
        return doc

    if len(body) == 1:
        only_child = body[0]
        leading, trailing = body.text, only_child.tail
        if not (leading and leading.strip()) and not (trailing and trailing.strip()):
            # The body has just one element, so it was probably a single
            # element passed in
            return only_child

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
901
902
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    The module's ``html_parser`` is used unless ``parser`` is given.
    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
915
916
def _contains_block_level_tag(el):
    """Tell whether ``el`` or one of its descendants is a block-level element.

    Tags are compared against ``defs.block_tags`` after dropping an XHTML
    namespace.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    return any(
        _nons(node.tag) in block_tags for node in el.iter(etree.Element))
924
925
def _element_name(el):
    """Return a short name for a parse result item, for use in error messages.

    Comments are reported as 'comment', text strings as 'string' and
    everything else by its tag name without XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, str):
        return 'string'
    return _nons(el.tag)
933
934
935################################################################################
936# form handling
937################################################################################
938
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.

        Assigning a mapping to ``fields`` sets every field it names and
        resets all other named fields of the form to None.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        fields = self.fields
        # Work on a private list so that removing the handled keys works
        # regardless of the kind of object ``keys()`` returns.
        prev_keys = list(fields.keys())
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        # whatever was not mentioned in the new mapping gets reset
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    def _name(self):
        # Identify the form by its name, else by '#' + id, else by its
        # position among the forms of the document body.
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.parse.urlencode()``.

        Skipped are: inputs without a name, inputs with a ``disabled``
        attribute, unchecked checkable inputs, inputs of type submit,
        image, reset or file, and inputs whose value is None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # one (name, value) pair per selected option
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset', 'file'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the getter returns the action
        joined with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attrib = self.attrib
        if 'action' in attrib:
            del attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())
1053
1054
1055HtmlElementClassLookup._default_element_classes['form'] = FormElement
1056
1057
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener (``open_http_urllib``) that is the
    file-like object produced by ``urlopen()``.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values collected from the
    form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's ``method`` (e.g. 'GET' or 'POST'), the URL is
    the form's ``action`` (falling back to its ``base_url`` when the action
    is empty), and the values are a sequence of ``(name, value)`` tuples
    with the form data.
    """
    values = form.form_values()
    if extra_values:
        # Mappings are flattened to their (name, value) pairs first.
        if hasattr(extra_values, 'items'):
            extra_pairs = extra_values.items()
        else:
            extra_pairs = extra_values
        values.extend(extra_pairs)
    opener = open_http_urllib if open_http is None else open_http
    url = form.action or form.base_url
    return opener(form.method, url, values)
1094
1095
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form()``, built on ``urllib``.

    For ``method == 'GET'`` the URL-encoded ``values`` are appended to
    ``url`` as a query string (joined with ``&`` if the URL already
    contains a ``?``, otherwise with ``?``).  For any other method the
    encoded values are sent as ASCII bytes in the request body.

    :param method: the HTTP method; only the exact string ``'GET'``
        selects query-string submission.
    :param url: the target URL.
    :param values: a sequence of ``(name, value)`` tuples (anything
        accepted by ``urlencode()``).
    :return: the object returned by ``urlopen()``.
    :raises ValueError: if ``url`` is empty or ``None``.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    # Imported here so that urllib is only loaded when a form is
    # actually submitted.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    # urlopen() needs the request body as bytes.
    return urlopen(url, encoded.encode('ASCII'))
1117
1118
class FieldsDict(MutableMapping):
    """
    Mapping view of a form's fields: keys are the field names reported by
    the wrapped inputs accessor, values are the ``.value`` of the matching
    field.  Setting an item assigns the field's ``.value``; deleting items
    is not supported.
    """

    def __init__(self, inputs):
        # ``inputs`` is the accessor object that does the name lookups
        # (``FormElement.fields`` passes its `InputGetter`).
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        # Returns whatever the accessor returns (a list for
        # `InputGetter`); the ``FormElement.fields`` setter relies on
        # being able to call ``.remove()`` on it.
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys, not the input elements: ``len(self.inputs)``
        # also counts unnamed inputs and every member of a same-named
        # group, which disagrees with what ``__iter__`` yields.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
1143
1144
class InputGetter:

    """
    Accessor over every input field of a form.

    Index it by field name, e.g. ``form.inputs['field_name']``.  When
    several checkboxes share that name the result is a `CheckboxGroup`
    (a list that also supports value setting); several radio inputs
    sharing a name come back as a `RadioGroup`.  ``.keys()`` and
    ``.items()`` walk the fields in this by-name fashion.

    Plain iteration yields the individual input elements instead, so
    checkboxes and radios show up one by one rather than grouped.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            type(self).__name__, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = []
        for candidate in self:
            if candidate.name == name:
                matches.append(candidate)
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        input_type = matches[0].get('type')
        if len(matches) > 1:
            # Same-named radios/checkboxes are wrapped in a group object
            # that remembers the shared name.
            group = None
            if input_type == 'radio':
                group = RadioGroup(matches)
            elif input_type == 'checkbox':
                group = CheckboxGroup(matches)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def keys(self):
        """
        Returns all unique field names, in document order.
        Inputs without a name are left out.

        :return: A list of all unique field names.
        """
        ordered = dict.fromkeys(el.name for el in self)
        return [name for name in ordered if name is not None]

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().
        Unlike ``keys()``, an unnamed input is reported once under the
        name ``None``.

        :return: A list of (name, field) tuples.
        """
        ordered = dict.fromkeys(el.name for el in self)
        return [(name, self[name]) for name in ordered]

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count
1232
1233
class InputMixin:
    """
    Shared behaviour of the input-like elements (input, select, and
    textarea): access to the ``name`` attribute and a compact ``repr()``.
    """
    @property
    def name(self):
        """
        Get/set/delete the ``name`` attribute of the element.  Deleting a
        missing attribute is a no-op.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        if 'name' in self.attrib:
            del self.attrib['name']

    def __repr__(self):
        # Only mention the type when the element exposes a non-empty one.
        type_name = getattr(self, 'type', None)
        type_part = ' type=%r' % type_name if type_name else ''
        return '<%s %x name=%r%s>' % (
            type(self).__name__, id(self), self.name, type_part)
1263
1264
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read and written through ``.value``.
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        Reading returns the element text followed by the serialisation of
        any child elements.  Writing drops all children and replaces the
        text; deleting empties the text and drops all children.
        """
        # Serialise children as XML for XHTML-namespaced textareas and as
        # HTML otherwise.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        parts = [self.text or '']
        # it's rare that there are any children at all
        for child in self:
            parts.append(etree.tostring(
                child, method=method, encoding='unicode'))
        return ''.join(parts)

    @value.setter
    def value(self, value):
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


# Use TextareaElement for <textarea> tags in the default class lookup.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1298
1299
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, except for a
    multi-select (``<select multiple>``), where it is a set-like object.
    ``.value_options`` lists the possible values in both cases.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        For a multi-select this is a set-like object representing all
        selected options.  Otherwise the last option carrying a
        ``selected`` attribute wins; without one, the first option that
        has no ``disabled`` attribute is used; without any such option the
        value is ``None``.  An option lacking a ``value`` attribute
        contributes its stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = None
        for option in reversed(options):
            if option.get('selected') is not None:
                chosen = option
                break
        if chosen is None:
            for option in options:
                if option.get('disabled') is None:
                    chosen = option
                    break
        if chosen is None:
            return None

        result = chosen.get('value')
        if result is None:
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, str):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return

        target = None
        if value is not None:
            # Locate the option to select before touching any attribute,
            # so an unknown value leaves the current selection alone.
            for option in _options_xpath(self):
                option_value = option.get('value')
                if option_value is None:
                    option_value = (option.text or '').strip()
                if option_value == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have: the ``value``
        attribute of each ``<option>`` element, or the option's stripped
        text where that attribute is missing.
        """
        values = []
        for option in _options_xpath(self):
            option_value = option.get('value')
            values.append(
                (option.text or '').strip() if option_value is None
                else option_value)
        return values

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


# Use SelectElement for <select> tags in the default class lookup.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1401
1402
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.  An option's value is its ``value``
    attribute, or its stripped text when that attribute is missing.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements (as returned by
        ``_options_xpath`` for the select).
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # Yields the values of the currently selected options only.
        for option in self.options:
            if 'selected' in option.attrib:
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = (option.text or '').strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        :raises ValueError: if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        :raises ValueError: if no option has that value, or if that
            option is not currently selected.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Same wording as in add() (was: "There is not option ...").
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1462
1463
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the list behaviour, the ``.value`` property checks/unchecks
    the inputs and ``.value_options`` reports the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value.  Reading returns the ``value`` attribute of the
        first checked radio, or ``None`` when nothing is checked.  Setting
        checks the radio with that value and unchecks all others; setting
        ``None`` (or deleting) unchecks everything.  An unknown value
        raises ``ValueError`` and leaves the group untouched.
        """
        checked = next((el for el in self if 'checked' in el.attrib), None)
        if checked is None:
            return None
        return checked.get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            target = next(
                (el for el in self if el.get('value') == value), None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        # Uncheck everything first, then check the requested radio.
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for el in self:
            options.append(el.get('value'))
        return options

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, list.__repr__(self))
1515
1516
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection;
        assigning something that is not iterable raises ``ValueError``
        and leaves the checkboxes unchanged.  Deleting unchecks all.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not wipe the currently checked boxes.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1559
1560
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes within a group
    of same-named checkboxes.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values first, then hand out an iterator
        # over that snapshot.
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox with this value; KeyError if none."""
        box = next(
            (el for el in self.group if el.get('value') == value), None)
        if box is None:
            raise KeyError("No checkbox with value %r" % value)
        box.set('checked', '')

    def remove(self, value):
        """
        Uncheck the first checkbox with this value.  Raises KeyError if
        there is no such checkbox or if it is not checked.
        """
        box = next(
            (el for el in self.group if el.get('value') == value), None)
        if box is None:
            raise KeyError(
                "No checkbox with value %r" % value)
        if 'checked' not in box.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del box.attrib['checked']

    def __repr__(self):
        # ``group.name`` is the attribute that InputGetter.__getitem__
        # assigns to the groups it creates.
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            type(self).__name__, shown, self.group.name)
1602
1603
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type (lower-cased, ``'text'`` when the
    attribute is missing) and ``.value`` reads/writes the value.

    Checkboxes and radios report ``input.checkable == True`` (all other
    types report false) and offer the boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a (non-empty) value reports
        ``'on'``; an unchecked one reports None.

        Setting a false value on a checkable input unchecks it; setting a
        true value checks it and, if the value is a string, stores it in
        the ``value`` attribute.  Deleting unchecks a checkable input and
        removes the ``value`` attribute of any other input.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Non-string true values (e.g. True) only toggle the check mark.
        if isinstance(value, str):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased and defaulting to ``'text'``.
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Only usable on checkable input types; other types raise
        ``AttributeError``.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        if 'checked' in self.attrib:
            del self.attrib['checked']


# Use InputElement for <input> tags in the default class lookup.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1699
1700
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.

        Setting stores the ``id`` of the given element in this label's
        ``for`` attribute (``TypeError`` if that element has no id);
        deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        try:
            return self.body.get_element_by_id(target_id)
        except KeyError:
            # NOTE(review): get_element_by_id() is defined outside this
            # section; a lookup that fails with KeyError is reported as
            # None, as promised by the docstring.
            return None

    @for_element.setter
    def for_element(self, other):
        other_id = other.get('id')
        if not other_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', other_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the ``for`` attribute; the label's own ``id``
        # must stay untouched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


# Use LabelElement for <label> tags in the default class lookup.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1735
1736
1737############################################################
1738## Serialization
1739############################################################
1740
def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, in place.

    Accepts either an object with a ``getroot()`` method or an element.
    Tags that already start with ``{`` (i.e. carry a namespace) are left
    alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # No getroot(): work on the object that was passed in.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in html.iter(etree.Element):
        name = element.tag
        if name[0] == '{':
            continue
        element.tag = xhtml_prefix + name
1754
1755
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree, in
    place.

    Accepts either an object with a ``getroot()`` method or an element.
    Only elements whose tag lies in the XHTML namespace are visited.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # No getroot(): work on the object that was passed in.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_prefix)
    for element in xhtml.iter(xhtml_prefix + "*"):
        namespaced = element.tag
        element.tag = namespaced[cut:]
1768
1769
# Substitution helpers (bound ``.sub`` methods of compiled patterns) that
# ``tostring()`` uses to remove the Content-Type <meta> tag from str and
# bytes output respectively.
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub
1776
1777
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    # The actual serialisation is delegated to etree.tostring().
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding, with_tail=with_tail,
                          doctype=doctype)
    # For HTML output, strip the Content-Type <meta> tag unless the
    # caller explicitly asked to keep it.
    if method == 'html' and not include_meta_content_type:
        # The result is either str or bytes; use the pattern of the
        # matching type.
        if isinstance(html, str):
            html = __str_replace_meta_content_type('', html)
        else:
            html = __bytes_replace_meta_content_type(b'', html)
    return html


# Post-process the docstring with __fix_docstring (defined outside this
# section).
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1853
1854
def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    :param doc: an element or an ElementTree; a bare element is wrapped
        in an ElementTree before writing.
    :param encoding: output encoding; defaults to the document's own
        encoding (``doc.docinfo.encoding``) and then to UTF-8.

    The ``file:`` URL of the temporary file is printed before the browser
    is launched.
    """
    import os
    import pathlib
    import tempfile
    import webbrowser
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but the ``with`` block at least makes
    # sure that it gets closed, even when serialisation fails
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    # as_uri() builds a well-formed file URL on every platform (three
    # slashes before a Windows drive letter, special characters quoted),
    # which plain 'file://' + path concatenation does not.
    url = pathlib.Path(fn).as_uri()
    print(url)
    webbrowser.open(url)
1876
1877
1878################################################################################
1879# configure Element class lookup
1880################################################################################
1881
class HTMLParser(etree.HTMLParser):
    """An ``etree.HTMLParser`` preconfigured with the lxml.html element
    class lookup, so that parsing returns lxml.html Element objects.

    All keyword arguments are handed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1889
1890
class XHTMLParser(etree.XMLParser):
    """An ``etree.XMLParser`` preconfigured with the lxml.html element
    class lookup, so that parsing returns lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.

    All keyword arguments are handed on to ``etree.XMLParser``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1911
1912
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are forwarded to ``html_parser.makeelement()``.

    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)
1920
1921
# Module-level default parser instances.  ``html_parser`` is the parser
# that ``Element()`` uses to create new elements.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()