1# Copyright (c) 2004 Ian Bicking. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are
5# met:
6#
7# 1. Redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer.
9#
10# 2. Redistributions in binary form must reproduce the above copyright
11# notice, this list of conditions and the following disclaimer in
12# the documentation and/or other materials provided with the
13# distribution.
14#
15# 3. Neither the name of Ian Bicking nor the names of its contributors may
16# be used to endorse or promote products derived from this software
17# without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""The ``lxml.html`` tool set for HTML handling.
32"""
33
34
# Public names exported by a star-import of this module.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
40
41
42import copy
43import re
44
45from collections.abc import MutableMapping, MutableSet
46from functools import partial
47from urllib.parse import urljoin
48
49from .. import etree
50from . import defs
51from ._setmixin import SetMixin
52
53
54def __fix_docstring(s):
55 # TODO: remove and clean up doctests
56 if not s:
57 return s
58 sub = re.compile(r"^(\s*)u'", re.M).sub
59 return sub(r"\1'", s)
60
61
# Namespace URI of XHTML; bound to the ``x`` prefix in the XPath expressions
# below that match both the plain HTML tag and its XHTML-namespaced form.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements (plain or XHTML) that carry a ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements (plain or XHTML).
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements (plain or XHTML).
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-separated ``class`` attribute contains the
# ``$class_name`` variable as a complete token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals the ``$id`` variable.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of a node, returned as a plain string (no smart strings).
_collect_string_content = etree.XPath("string()", smart_strings=False)
# Iterates over CSS ``url(...)`` occurrences (case-insensitive); group 1 is
# the argument, including surrounding double or single quotes if present.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over CSS ``@import "..."`` occurrences; group 1 is the quoted text.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements (plain or XHTML) anywhere in the document whose ``for``
# attribute equals the ``$id`` variable.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters (one entry of a space separated list).
_archive_re = re.compile(r'[^ ]+')
# Searches a meta refresh ``content`` value for the part that follows the
# first ';' (with an optional ``url=`` prefix) and exposes it as group 'url'.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
81
82
83def _unquote_match(s, pos):
84 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
85 return s[1:-1], pos+1
86 else:
87 return s,pos
88
89
90def _transform_result(typ, result):
91 """Convert the result back into the input type.
92 """
93 if issubclass(typ, bytes):
94 return tostring(result, encoding='utf-8')
95 elif issubclass(typ, str):
96 return tostring(result, encoding='unicode')
97 else:
98 return result
99
100
101def _nons(tag):
102 if isinstance(tag, str):
103 if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
104 return tag.split('}')[-1]
105 return tag
106
107
class Classes(MutableSet):
    """Set-like view on the ``class`` attribute of an element.

    The object wraps an attribute mapping and reads/writes its 'class'
    entry on every operation, so changes are immediately visible on the
    element.  Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        self._attributes = attributes
        # Returns the raw attribute value, '' if the attribute is missing.
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _require_valid_name(value):
        # A class name must be non-empty and free of whitespace.
        if value and not re.search(r'\s', value):
            return
        raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        Nothing changes if the class is already present.  Raises
        ValueError for an empty name or one containing whitespace.
        """
        self._require_valid_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        A missing class is silently ignored.  When no class is left, the
        'class' attribute itself is deleted.  Raises ValueError for an
        empty name or one containing whitespace.
        """
        self._require_valid_name(value)
        remaining = [
            name for name in self._get_class_value().split() if name != value]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class that must currently be present.

        Raises KeyError if the class is not present and ValueError for an
        empty name or one containing whitespace.
        """
        self._require_valid_name(value)
        super().remove(value)

    def __contains__(self, name):
        class_value = self._get_class_value()
        # Cheap substring test first; only split when it can be a match.
        if name not in class_value:
            return False
        return name in class_value.split()

    def __iter__(self):
        names = self._get_class_value().split()
        return iter(names)

    def __len__(self):
        names = self._get_class_value().split()
        return len(names)

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values' that are not present yet.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # Only write the attribute when something was actually added.
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Switch a class name on if it is missing, or off if it is present.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).  Raises ValueError
        for an empty name or one containing whitespace.
        """
        self._require_valid_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
214
215
class HtmlMixin:
    """Mixin providing HTML specific helpers for lxml node classes.

    The methods rely on the element API (``get``, ``set``, ``attrib``,
    ``iter``, ``xpath``, ``getparent``, ``getroottree`` ...) of the class
    this mixin is combined with.
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super().set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urllib.parse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.  Returns None if there is none.
        """
        for element in self.getroottree().iter("body", f"{{{XHTML_NAMESPACE}}}body"):
            return element
        return None

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.  Returns None if there is none.
        """
        for element in self.getroottree().iter("head", f"{{{XHTML_NAMESPACE}}}head"):
            return element
        return None

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The association is made through the ``for`` attribute of the label
        and the ``id`` of this element; None is returned if this element
        has no id or no label refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, str):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # the tail follows the last child once the tag is gone
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # replace this element by its children in the parent
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison is case-insensitive and against the complete
        ``rel`` attribute value.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator=translator)(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value; in both cases the document
        has not been modified yet.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Reject a bad option before anything in the tree gets changed.
        if handle_failures not in (None, 'ignore', 'discard'):
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # Apply the caller's failure policy to this pass as well.
            self.resolve_base_href(handle_failures=handle_failures)

        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                if handle_failures is None:
                    raise
                # 'ignore' keeps the link as it is, 'discard' removes it
                return href if handle_failures == 'ignore' else None

        # <base href> has either been resolved above or must be ignored,
        # so rewrite_links() must not resolve it (its default) again.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If several such tags exist, all of them are removed and the
        ``href`` of the last one is applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # 'archive' holds a space separated list of URLs
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If ``resolve_base_href`` is true (default), ``<base href>`` tags
        are applied to the links and removed before rewriting.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # link inside the element text: splice at its position
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
623
624
class _MethodFunc:
    """
    Callable that exposes an element method as a plain function.

    The function accepts either an element or an HTML string (``str`` or
    ``bytes``).  Strings are parsed with ``fromstring()`` first.  The
    return value is whatever the method returns; if the method works
    in-place (and so returns None), the resulting document is returned
    instead, serialised back to the input type for string input.

    For element input, the ``copy`` keyword (defaulting to the ``copy``
    value given at creation time) decides whether the method runs on a
    deep copy of the element.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # borrow the documentation of the wrapped method
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, (str, bytes)):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
659
660
# Function versions of the HtmlMixin link/search methods (see _MethodFunc).
# ``copy=True`` is used for the methods that modify the tree, so that an
# element passed in is deep-copied by default instead of being changed.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
667
668
class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment node class that combines ``HtmlMixin`` with ``etree.CommentBase``."""
    pass
671
672
class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class that combines ``HtmlMixin`` with ``etree.ElementBase``."""
    pass
675
676
class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing instruction class that combines ``HtmlMixin`` with ``etree.PIBase``."""
    pass
679
680
class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity node class that combines ``HtmlMixin`` with ``etree.EntityBase``."""
    pass
683
684
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs; the latter allows several mixins
    for the same tag name.  Note that '*' only covers the tag names
    present in ``classes``, and that a ``classes`` dict passed by the
    caller is updated in place when mixins are applied.
    """
    # tag name -> Element class, used when no ``classes`` argument is given
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, so take
            # the items of a mapping and use any other iterable as pairs.
            mixin_items = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in mixin_items:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # the mixins come first so that they take precedence
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are looked up by their lower-cased tag name and default
        to ``HtmlElement``; comments, processing instructions and entities
        map to their Html* counterparts.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
725
726
727################################################################################
728# parsing
729################################################################################
730
# Match input that, after optional leading whitespace, starts with an
# ``<html`` tag or a ``<!doctype`` declaration (case-insensitive).  There is
# one matcher for ``str`` input and one for ``bytes`` input.
_looks_like_full_html_unicode = re.compile(
    r'^\s*<(?:html|!doctype)', re.I).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', re.I).match
735
736
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module level ``html_parser``; additional
    keyword arguments are passed on to ``etree.fromstring()``.

    If ``ensure_head_body`` is true, an empty ``<head>`` and/or ``<body>``
    child is added to the root element where the parsed document lacks one.

    Raises ``etree.ParserError`` if parsing produced no document.
    """
    root = etree.fromstring(
        html, html_parser if parser is None else parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
749
750
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    Input that does not look like a full document is wrapped into
    ``<html><body>...</body></html>`` before parsing; the result is the
    content of the parsed body.

    The first item in the list may be a string (the leading text).
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            html = b''.join((b'<html><body>', html, b'</body></html>'))
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html

    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]

    # whitespace-only text before the first element does not count
    leading_text = body.text
    has_leading_text = bool(leading_text and leading_text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading_text)

    elements = [leading_text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
788
789
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element ('div'
    unless a tag name is given).  In this case, leading or trailing text
    is also allowed, as are multiple elements as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).

    Raises ``etree.ParserError`` if the input does not fit the above.
    """
    if parser is None:
        parser = html_parser

    # leading text is only acceptable when a parent wraps the result
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, str) else 'div'
        new_root = Element(parent_tag)
        if elements and isinstance(elements[0], str):
            new_root.text = elements.pop(0)
        new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)" % names)

    el = elements[0]
    trailing_text = el.tail
    if trailing_text and trailing_text.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % trailing_text)
    el.tail = None
    return el
837
838
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document:

    * input starting with ``<html`` or a doctype, or producing a ``<head>``,
      is returned as the complete document root
    * a body holding exactly one element and no other significant text
      is returned as that element
    * otherwise the body itself is returned, renamed to 'div' if it
      contains a block level tag and to 'span' if not

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        extra_text = other_body.text
        if extra_text:
            if len(body):
                last_child = body[-1]
                last_child.tail = (last_child.tail or '') + extra_text
            else:
                body.text = (body.text or '') + extra_text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc
    if body is None:
        return doc

    def is_blank(text):
        # true for None, '' and whitespace-only text
        return not text or not text.strip()

    if len(body) == 1 and is_blank(body.text) and is_blank(body[-1].tail):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
905
906
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    ``parser`` defaults to the module level ``html_parser``.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
919
920
def _contains_block_level_tag(el):
    """Tell whether ``el`` or any element below it has a tag in ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
928
929
def _element_name(el):
    """Return a short name for ``el`` for use in error messages.

    Comments give 'comment', strings give 'string' and everything else
    its tag without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, str):
        return 'string'
    return _nons(el.tag)
937
938
939################################################################################
940# form handling
941################################################################################
942
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.

        Assigning a mapping to this property sets the given fields and
        sets every other named field of the form to None.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        fields = self.fields
        # keys that are not part of the new mapping remain in this collection
        leftover_keys = fields.keys()
        for key, new_value in value.items():
            if key in leftover_keys:
                leftover_keys.remove(key)
            fields[key] = new_value
        for key in leftover_keys:
            # Case of an unnamed input (key is None); these aren't really
            # expressed in form_values() anyway.
            if key is not None:
                fields[key] = None

    def _name(self):
        """Return a name for this form: its ``name`` attribute, else '#' plus
        its ``id``, else its index among the forms found below the body."""
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Skipped are fields without a name, disabled fields, unchecked
        checkable inputs, inputs of type submit/image/reset/file and
        fields whose value is None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # one (name, value) pair per selected value
                    results.extend((name, v) for v in value)
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        On reading, the value is joined with the document's base URL if
        both are available.
        """
        base_url = self.base_url
        action = self.get('action')
        if action is None or not base_url:
            return action
        return urljoin(base_url, action)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attrib = self.attrib
        if 'action' in attrib:
            del attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        method = self.get('method', 'GET')
        return method.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())
1057
1058
# Register FormElement under the 'form' key of the lookup's default
# element classes.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
1060
1061
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.request.urlopen()``.  This object also has a ``.geturl()``
    function, which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values collected from the
    form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string (the form's action, or its base URL when no
    action is set), and the values are a sequence of ``(name, value)``
    tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        # Mappings are flattened to their (name, value) pairs.
        extra_pairs = (
            extra_values.items() if hasattr(extra_values, 'items')
            else extra_values)
        values.extend(extra_pairs)
    opener = open_http_urllib if open_http is None else open_http
    url = form.action if form.action else form.base_url
    return opener(form.method, url, values)
1098
1099
def open_http_urllib(method, url, values):
    """
    Default ``open_http`` implementation of `submit_form`, built on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string (joined with ``&`` if the URL already contains a ``?``).
    For any other method they are sent as the ASCII-encoded request body.

    Returns the result of ``urlopen()``.  Raises ``ValueError`` if ``url``
    is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    # This module is Python 3 only, so the old ``from urllib import ...``
    # fallback was dead code; import from the real locations directly.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    if method == 'GET':
        separator = '&' if '?' in url else '?'
        url += separator + urlencode(values)
        data = None
    else:
        # urlencode() returns text; urlopen() wants a bytes body.
        data = urlencode(values).encode('ASCII')
    return urlopen(url, data)
1121
1122
class FieldsDict(MutableMapping):
    """
    Mapping view of a form's fields: field name -> field value.

    Wraps the ``inputs`` accessor it is given; reading or writing a key
    reads or writes the ``.value`` of ``inputs[key]``.  Keys cannot be
    deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys this mapping iterates over, not the raw input
        # elements: several elements may share a name and unnamed ones
        # have no key, so len(self.inputs) would disagree with __iter__.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
1147
1148
class InputGetter:

    """
    Accessor for all the input fields of a form.

    Look fields up by name with ``form.inputs['field_name']``.  Several
    checkboxes sharing one name come back as a list-like `CheckboxGroup`
    (which also allows value setting); several radio inputs sharing one
    name come back as a `RadioGroup`.  ``.keys()`` and ``.items()``
    process the fields by name in this way.

    Iterating over the accessor instead yields every input element
    individually, so checkboxes and radios are not grouped there.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '<%s for form %s>' % (cls_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [el for el in self if el.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        first = matches[0]
        input_type = first.get('type')
        if len(matches) > 1 and input_type in ('radio', 'checkbox'):
            group_class = RadioGroup if input_type == 'radio' else CheckboxGroup
            group = group_class(matches)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return any(el.name == name for el in self)

    def keys(self):
        """
        Returns all unique field names, in document order.  Elements
        without a name are left out.

        :return: A list of all unique field names.
        """
        seen = {None}
        names = []
        for el in self:
            name = el.name
            if name in seen:
                continue
            seen.add(name)
            names.append(name)
        return names

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        :return: A list of (name, field) tuples.
        """
        seen = set()
        pairs = []
        for el in self:
            name = el.name
            if name in seen:
                continue
            seen.add(name)
            pairs.append((name, self[name]))
        return pairs

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count
1236
1237
class InputMixin:
    """
    Mix-in shared by all input elements (input, select, and textarea).

    Adds a ``name`` property backed by the ``name`` attribute and a repr
    that shows the name and, when the object has one, its ``type``.
    """
    @property
    def name(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        # Deleting an absent name is a no-op rather than an error.
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        input_type = getattr(self, 'type', None)
        type_part = ' type=%r' % input_type if input_type else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
1267
1268
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read and written through ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)

        Child elements, if any, are serialised and appended to the
        element's text: as XML when the tag is in the XHTML namespace,
        as HTML otherwise.
        """
        pieces = [self.text or '']
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        method = 'xml' if self.tag.startswith(xhtml_prefix) else 'html'
        # Children are rare inside a textarea; usually this adds nothing.
        pieces.extend(
            etree.tostring(child, method=method, encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop any child elements, then replace the text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]
1299
1300
# Register TextareaElement under the 'textarea' key of the lookup's
# default element classes.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1302
1303
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is a
    set-like object.  Either way ``.value_options`` lists the possible
    values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select the last option carrying ``selected`` wins;
        without any, the first option that is not ``disabled`` is used,
        and ``None`` is returned if there is no such option.  An option
        without a ``value`` attribute contributes its stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = next(
            (opt for opt in reversed(options)
             if opt.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (opt for opt in options if opt.get('disabled') is None),
                None)
            if chosen is None:
                return None
        chosen_value = chosen.get('value')
        if chosen_value is None:
            chosen_value = (chosen.text or '').strip()
        return chosen_value

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, str):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        target = None
        if value is not None:
            # Find the option to select before touching any attribute, so
            # that an unknown value leaves the element unchanged.
            for option in _options_xpath(self):
                option_value = option.get('value')
                if option_value is None:
                    option_value = (option.text or '').strip()
                if option_value == value:
                    target = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of each ``<option>`` element, or its stripped text
        when that attribute is missing).
        """
        values = []
        for option in _options_xpath(self):
            option_value = option.get('value')
            values.append(
                (option.text or '').strip() if option_value is None
                else option_value)
        return values

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']
1402
1403
# Register SelectElement under the 'select' key of the lookup's default
# element classes.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1405
1406
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _option_value(option):
        """Return the option's ``value`` attribute, or its stripped text."""
        value = option.get('value')
        if value is None:
            value = (option.text or '').strip()
        return value

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        """Yield the value of every option that carries ``selected``."""
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ``ValueError`` if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ``ValueError`` if that option is not currently selected or
        if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1466
1467
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides behaving like a list, the group offers the ``.value``
    property to check/uncheck inputs and ``.value_options`` to list the
    possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reads as the value of the first checked radio, or ``None`` when
        none is checked.  Setting ``None`` unchecks every radio; setting
        a value no radio has raises ``ValueError``.
        """
        checked = next(
            (radio for radio in self if 'checked' in radio.attrib), None)
        if checked is None:
            return None
        return checked.get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            # Resolve the target first so a bad value changes nothing.
            target = next(
                (radio for radio in self if radio.get('value') == value),
                None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, list.__repr__(self))
1519
1520
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection;
        assigning something that is not iterable raises ``ValueError``
        and leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before mutating: clearing first would wipe the current
        # selection even when the assignment is then rejected.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1563
1564
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the full list up front so the iterator is a snapshot.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox with this value (KeyError if none)."""
        box = next(
            (el for el in self.group if el.get('value') == value), None)
        if box is None:
            raise KeyError("No checkbox with value %r" % value)
        box.set('checked', '')

    def remove(self, value):
        """
        Uncheck the first checkbox with this value.

        Raises ``KeyError`` if there is no such checkbox or if it is
        already unchecked.
        """
        box = next(
            (el for el in self.group if el.get('value') == value), None)
        if box is None:
            raise KeyError(
                "No checkbox with value %r" % value)
        if 'checked' not in box.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del box.attrib['checked']

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)
1606
1607
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type (lower-cased, defaulting to
    ``'text'``) and ``.value`` gets and sets the value.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for all other types) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a (non-empty) value reads as
        ``'on'``; one that is not checked reads as None.

        Setting a checkable input checks or unchecks it according to the
        truthiness of the new value, and stores the value only if it is a
        non-empty string.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            if isinstance(value, str):
                self.set('value', value)
        else:
            self.checked = False

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        return self.get('type', 'text').lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; on any other
        type, reading or writing raises ``AttributeError``.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']
1700
1701
# Register InputElement under the 'input' key of the lookup's default
# element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1703
1704
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.

        Setting requires the other element to have an ``id`` attribute
        (``TypeError`` otherwise); deleting removes the ``for``
        attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # Pass an explicit default so that a dangling ``for`` yields None,
        # as documented, instead of the lookup's "not found" error.
        return self.body.get_element_by_id(target_id, None)

    @for_element.setter
    def for_element(self, other):
        other_id = other.get('id')
        if not other_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', other_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the label's ``for`` attribute; the label's own
        # ``id`` must be left alone.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']
1736
1737
# Register LabelElement under the 'label' key of the lookup's default
# element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1739
1740
1741############################################################
1742## Serialization
1743############################################################
1744
def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, in place.

    ``html`` may be an element or an object with a ``getroot()`` method.
    Tags that already carry a namespace are left alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # No getroot(): use the argument itself as the root.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter(etree.Element):
        if node.tag[0] == '{':
            continue
        node.tag = xhtml_prefix + node.tag
1758
1759
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree, in
    place, turning it into a plain HTML tree.

    ``xhtml`` may be an element or an object with a ``getroot()`` method.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # No getroot(): use the argument itself as the root.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_prefix)
    # Only elements in the XHTML namespace are visited.
    for node in xhtml.iter(xhtml_prefix + "*"):
        node.tag = node.tag[cut:]
1772
1773
# Substitution helpers (the bound ``sub`` methods of compiled patterns) that
# tostring() uses to strip the Content-Type <meta> tag from serialised
# output; one works on str, the other on bytes.
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub
1780
1781
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: unless ``include_meta_content_type`` is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    output of the 'html' method.  When it is true, the tag written by
    the serialiser is left in place.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # The result is str or bytes depending on ``encoding``; use the
    # matching substitution helper.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)
1854
1855
# Pass the docstring through the module's __fix_docstring helper (defined
# elsewhere in this file; presumably it adjusts the doctest examples --
# TODO confirm).
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1857
1858
def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an ElementTree.  The file is written
    with ``encoding`` if given, else the document's own encoding, else
    UTF-8.  The ``file:`` URL of the temporary file is printed before
    the browser is launched.
    """
    import os
    import webbrowser
    import tempfile
    from pathlib import Path
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # We leak the file itself here, but the handle is always closed.
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    # as_uri() yields a well-formed file URL on every platform (e.g.
    # file:///C:/... on Windows) and percent-encodes special characters,
    # which plain string concatenation did not.
    url = Path(fn).as_uri()
    print(url)
    webbrowser.open(url)
1880
1881
1882################################################################################
1883# configure Element class lookup
1884################################################################################
1885
class HTMLParser(etree.HTMLParser):
    """An ``etree.HTMLParser`` set up to produce lxml.html Element
    objects.

    Keyword arguments are forwarded unchanged to the base parser.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1893
1894
class XHTMLParser(etree.XMLParser):
    """An ``etree.XMLParser`` set up to produce lxml.html Element
    objects.

    Keyword arguments are forwarded unchanged to the base parser.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1915
1916
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are handed to ``html_parser.makeelement()``.

    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)
1924
1925
# Shared module-level parser instances.  html_parser is the one Element()
# uses to create elements.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()