Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/lxml/html/__init_

3# Redistribution and use in source and binary forms, with or without

4# modification, are permitted provided that the following conditions are

5# met:

7# 1. Redistributions of source code must retain the above copyright

8# notice, this list of conditions and the following disclaimer.

10# 2. Redistributions in binary form must reproduce the above copyright

11# notice, this list of conditions and the following disclaimer in

12# the documentation and/or other materials provided with the

13# distribution.

14#

15# 3. Neither the name of Ian Bicking nor the names of its contributors may

16# be used to endorse or promote products derived from this software

17# without specific prior written permission.

18#

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR

23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

31"""The ``lxml.html`` tool set for HTML handling.

32"""

34from __future__ import absolute_import

36__all__ = [

37 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',

38 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',

39 'find_rel_links', 'find_class', 'make_links_absolute',

40 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

43import copy

44import sys

45import re

46from functools import partial

48try:

49 from collections.abc import MutableMapping, MutableSet

50except ImportError:

51 from collections import MutableMapping, MutableSet

53from .. import etree

54from . import defs

55from ._setmixin import SetMixin

57try:

58 from urlparse import urljoin

59except ImportError:

60 # Python 3

61 from urllib.parse import urljoin

63try:

64 unicode

65except NameError:

66 # Python 3

67 unicode = str

68try:

69 basestring

70except NameError:

71 # Python 3

72 basestring = (str, bytes)

75def __fix_docstring(s):

76 if not s:

77 return s

78 if sys.version_info[0] >= 3:

79 sub = re.compile(r"^(\s*)u'", re.M).sub

80 else:

81 sub = re.compile(r"^(\s*)b'", re.M).sub

82 return sub(r"\1'", s)

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each element query is written twice, once
# for plain HTML tags and once for the same tag in the XHTML namespace
# (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x': XHTML_NAMESPACE})
# Alternative formulation of the class lookup via EXSLT regular expressions:
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Whole-word match of $class_name inside the space-normalised class attribute.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

97_iter_css_urls = re.compile(r'url$('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)$', re.I).finditer

98_iter_css_imports = re.compile(r'@import "(.*?)"').finditer

# Finds <label for="..."> elements (HTML or XHTML) anywhere in the document.
_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces={'x': XHTML_NAMESPACE})
# One space-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
# Extracts the URL part of a <meta http-equiv="refresh" content="..."> value:
# everything after the first ';', with an optional leading "url=".
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$',
    re.I).search

104

105

106def _unquote_match(s, pos):

107 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":

108 return s[1:-1], pos+1

109 else:

110 return s,pos

111

112

def _transform_result(typ, result):
    """Serialise ``result`` back into the type the caller passed in.

    A ``bytes`` input type yields UTF-8 encoded markup, a text input type
    yields a text string, and any other type gets ``result`` back unchanged.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)

122

123

def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace, if it has one.

    Non-string tags (such as those of comments) and tags in any other
    namespace are returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

129

130

class Classes(MutableSet):
    """Set-like view on the ``class`` attribute of an element.

    The view keeps no state of its own: every operation reads the current
    attribute value from the wrapped attribute mapping and writes the result
    straight back.

    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        self._attributes = attributes
        # Reads the raw attribute value, '' when the attribute is missing.
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _require_valid_name(value):
        # A class name must be non-empty and free of whitespace.
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        """
        self._require_valid_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        """
        self._require_valid_name(value)
        kept = []
        for name in self._get_class_value().split():
            if name != value:
                kept.append(name)
        if kept:
            self._attributes['class'] = ' '.join(kept)
        elif 'class' in self._attributes:
            # Nothing left: drop the attribute instead of leaving it empty.
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        """
        self._require_valid_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        # The cheap substring test short-cuts the split for most misses.
        return name in raw and name in raw.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # Only write the attribute when something was actually added.
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._require_valid_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled

237

238

class HtmlMixin(object):
    """HTML-specific helpers that are mixed into the element classes.

    The methods rely on the element API of the class they are combined with
    (``get``, ``set``, ``attrib``, ``xpath``, ``iter``, ``getparent`` ...).
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super(HtmlMixin, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Indexing the empty XPath result raises IndexError when the
        document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Indexing the empty XPath result raises IndexError when the
        document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The association is made through the ``for`` attribute of the label
        and the ``id`` of this element; None is returned when this element
        has no id or no label refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children in the parent's child list.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison is case-insensitive and matches the whole attribute
        value.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError when no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()

        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    # None tells rewrite_links() to remove the link.
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    # One attribute can carry several links (inline style,
                    # archive lists), so it may already have been removed
                    # for an earlier link; pop() keeps that a no-op.
                    el.attrib.pop(attrib, None)
                continue

            if attrib is None:
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if cur is None:
                    # The attribute was discarded while handling an earlier
                    # link of the same attribute; nothing left to rewrite.
                    continue
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

642

643

class _MethodFunc(object):
    """
    Function-style wrapper around a method of an element class.

    The resulting callable accepts either an element or an HTML string as
    its first argument.  It returns whatever the method returns; when the
    method works in-place (and therefore returns None), the modified
    document is returned instead, serialised back to the input type if the
    input was a string.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Expose the documentation of the wrapped method.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # An explicit copy=... keyword overrides the wrapper's default.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        bound_method = getattr(doc, self.name)
        result = bound_method(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place operation: hand back the document in the input's type.
        return _transform_result(result_type, doc)


# Module-level function forms of the HtmlMixin link helpers.  The modifying
# ones work on a copy of an element argument by default.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

686

687

class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment node class that carries the HtmlMixin helpers."""


class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class that carries the HtmlMixin helpers."""


class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing instruction class that carries the HtmlMixin helpers."""


class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity node class that carries the HtmlMixin helpers."""

702

703

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs; the latter allows several mixins for
    the same tag.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield bare keys and break
            # the pair unpacking below, so go through items() when available.
            mixin_pairs = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # Mix into every element class known so far.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they override the element class.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to instantiate for a node, or None for the
        normal fallback lookup.  Element tag names are matched
        case-insensitively.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

744

745

746################################################################################

747# parsing

748################################################################################

749

# Both matchers test whether the input starts (after optional whitespace)
# with an <html> tag or a doctype declaration, i.e. looks like a complete
# document rather than a fragment.  One variant per input string type.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'),
    re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'),
    re.I).match

754

755

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; further keyword
    arguments are passed on to ``etree.fromstring()``.  With
    ``ensure_head_body`` set, a missing ``<head>`` and/or ``<body>`` child is
    added to the root.  Raises ``etree.ParserError`` if parsing yields no
    root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

768

769

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Wrap anything that is not a full document so that it ends up in <body>.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            opening = '<html><body>'.encode('ascii')
            closing = '</body></html>'.encode('ascii')
            html = opening + html + closing
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    has_leading_text = bool(body.text and body.text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    fragments = [body.text] if has_leading_text else []
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments

807

808

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a wrapper element will hold it.
    wrap_in_parent = bool(create_parent)
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap_in_parent,
        base_url=base_url, **kw)

    if create_parent:
        # Any true non-string value selects the default wrapper tag.
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    single = elements[0]
    if single.tail and single.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % single.tail)
    single.tail = None
    return single

856

857

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there can be multiple bodies, which is bad, but just
    # smash them into one body
    for extra_body in bodies[1:]:
        if extra_body.text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + extra_body.text
            else:
                body.text = (body.text or '') + extra_body.text
        body.extend(extra_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        extra_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for extra_head in heads[1:]:
            head.extend(extra_head)
            # We don't care about text or tail in a head
            extra_head.drop_tree()
        return doc

    if body is None:
        return doc

    no_leading_text = not body.text or not body.text.strip()
    if len(body) == 1 and no_leading_text:
        only_child = body[-1]
        if not only_child.tail or not only_child.tail.strip():
            # The body has just one element, so it was probably a single
            # element passed in
            return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body

924

925

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    # Fall back to the module's default HTML parser.
    active_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, active_parser, base_url=base_url, **kw)

938

939

def _contains_block_level_tag(el):
    """Tell whether ``el`` or any element below it has a block-level tag
    (one listed in ``defs.block_tags``).
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))

947

948

def _element_name(el):
    """Return a short name for a parsed fragment, for use in error messages:
    'comment', 'string', or the tag name without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

956

957

958################################################################################

959# form handling

960################################################################################

961

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        An accessor object covering every input element of this form.

        See `InputGetter` for details about the returned object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view of all the fields in this form.  Assigning
        to an entry of the returned object changes the form itself.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Names present in the form that the new mapping does not mention;
        # whatever is left after the first loop gets reset to None.
        leftover = current.keys()
        for name, new_value in value.items():
            if name in leftover:
                leftover.remove(name)
            current[name] = new_value
        for name in leftover:
            # An unnamed input is skipped; these aren't really
            # expressed in form_values() anyway.
            if name is not None:
                current[name] = None

    def _name(self):
        """
        Return a label for this form: its ``name``, else ``#`` plus its
        ``id``, else its index among the forms found under ``self.body``.
        """
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            # Fall back to namespaced (XHTML) form elements.
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of ``(name, value)`` tuples of the field values
        for the form.  This is suitable to be passed to
        ``urllib.urlencode()``.

        Unnamed and disabled fields are left out, as are unchecked
        checkable inputs, inputs of type submit/image/reset/file, and
        fields whose value is None.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # One pair per selected option.
                    results.extend((name, v) for v in value)
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a base URL and an action, the action is
        returned joined onto that base URL.
        """
        action = self.get('action')
        base_url = self.base_url
        if action is None or not base_url:
            return action
        return urljoin(base_url, action)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  The getter always returns an
        upper-case string and defaults to ``'GET'``.
        """
        raw_method = self.get('method', 'GET')
        return raw_method.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement

1079

1080

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as
    from ``urllib.urlopen()``, which also has a ``.geturl()`` function
    showing the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    pairs; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http``
    keyword argument that opens the URL for you.  The function must
    have the following signature::

        open_http(method, URL, values)

    The method is the form's method (for example 'GET' or 'POST'), the
    URL is the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            pairs = extra_values.items()
        else:
            pairs = extra_values
        values.extend(pairs)
    opener = open_http_urllib if open_http is None else open_http
    # A form without a usable action submits to its own base URL.
    url = form.action or form.base_url
    return opener(form.method, url, values)

1117

1118

def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, built on ``urlopen``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as
    a query string; for any other method they are sent as the request
    body.  Raises ``ValueError`` when ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        # Extend an existing query string, or start a new one.
        separator = '&' if '?' in url else '?'
        url = url + separator + urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)

1140

1141

class FieldsDict(MutableMapping):
    """
    Mapping of field names to field values, backed by an inputs
    accessor.  Reading or writing an entry reads or writes the
    ``.value`` of the corresponding input; entries cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Deleting form fields through this view is not supported.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        return '<%s for form %s>' % (
            type(self).__name__, self.inputs.form._name())

1166

1167

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing one name come back as a list (a
    `CheckboxGroup`, which also allows value setting), and radio
    inputs are handled the same way.  Use ``.keys()`` and ``.items()``
    to process all fields in this way.

    Iterating over this object yields every input element instead.
    That is not the same as looking up each name, because checkboxes
    and radio elements are then returned individually.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            type(self).__name__, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [field for field in self if field.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        first = matches[0]
        input_type = first.get('type')
        group_class = None
        if len(matches) > 1:
            # Only several same-named radios/checkboxes form a group.
            if input_type == 'radio':
                group_class = RadioGroup
            elif input_type == 'checkbox':
                group_class = CheckboxGroup
        if group_class is None:
            # I don't like throwing away elements like this
            return first
        group = group_class(matches)
        group.name = name
        return group

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def keys(self):
        """
        Returns all unique field names, in document order.  Fields
        without a name are left out.

        :return: A list of all unique field names.
        """
        seen = {None}
        names = []
        for el in self:
            name = el.name
            if name in seen:
                continue
            seen.add(name)
            names.append(name)
        return names

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        :return: A list of (name, field) tuples.
        """
        seen = set()
        pairs = []
        for el in self:
            name = el.name
            if name in seen:
                continue
            seen.add(name)
            pairs.append((name, self[name]))
        return pairs

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count

1255

1256

class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        Deleting it removes the attribute if present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only some input classes define ``type``; show it when non-empty.
        type_name = getattr(self, 'type', None)
        type_part = (' type=%r' % type_name) if type_name else ''
        return '<%s %x name=%r%s>' % (
            type(self).__name__, id(self), self.name, type_part)

1286

1287

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        Child elements, if any, are serialised and appended to the
        element's text; the 'xml' method is used for tags in the XHTML
        namespace and 'html' otherwise.
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        # It's rare that a textarea actually has child elements.
        pieces = [self.text or '']
        pieces.extend(
            etree.tostring(
                child, method=serialisation_method, encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop any child elements, then store the new text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

1321

1322

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is
    a set-like object.  In either case ``.value_options`` gives the
    possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select the last option carrying ``selected``
        wins; without one, the first option that is not ``disabled``
        is used, and None is returned when there is no such option.
        An option without a ``value`` attribute contributes its
        stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = next(
            (el for el in reversed(options)
             if el.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (el for el in options if el.get('disabled') is None),
                None)
            if chosen is None:
                return None
        value = chosen.get('value')
        if value is None:
            value = (chosen.text or '').strip()
        return value

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        checked_option = None
        if value is not None:
            # Find the first option whose effective value matches.
            for candidate in _options_xpath(self):
                candidate_value = candidate.get('value')
                if candidate_value is None:
                    candidate_value = (candidate.text or '').strip()
                if candidate_value == value:
                    checked_option = candidate
                    break
            if checked_option is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then select the requested option (if any).
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        text of an option that has no such attribute).
        """
        return [
            (el.text or '').strip() if el.get('value') is None
            else el.get('value')
            for el in _options_xpath(self)]

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

1424

1425

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    Adding a value to this set-like object selects the matching
    option, and removing a value unselects it.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def _find_option(self, item):
        # First option whose effective value (the ``value`` attribute, or
        # the stripped text when that is missing) equals item; else None.
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                return option
        return None

    def __iter__(self):
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            yield opt_value

    def add(self, item):
        option = self._find_option(item)
        if option is None:
            raise ValueError(
                "There is no option with the value %r" % item)
        option.set('selected', '')

    def remove(self, item):
        option = self._find_option(item)
        if option is None:
            raise ValueError(
                "There is not option with the value %r" % item)
        if 'selected' not in option.attrib:
            raise ValueError(
                "The option %r is not currently selected" % item)
        del option.attrib['selected']

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            type(self).__name__,
            ', '.join(repr(v) for v in self),
            self.select.name)

1485

1486

class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    It behaves like a list, and additionally offers the property
    ``.value`` to check/uncheck inputs and ``.value_options`` to get
    the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).  Reading returns the value of the
        first checked radio, or None when none is checked.
        """
        checked = next(
            (el for el in self if 'checked' in el.attrib), None)
        if checked is None:
            return None
        return checked.get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            target = next(
                (el for el in self if el.get('value') == value), None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        # Uncheck everything first so at most one radio ends up checked.
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, list.__repr__(self))

1538

1539

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values checks exactly those
        checkboxes.  Assigning something that has no ``__iter__``
        raises ``ValueError`` and leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before mutating anything, so that a rejected
        # assignment does not wipe the current state.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        # Materialise first: ``value`` may be a live view of this very
        # group (e.g. ``group.value = group.value``), which clearing
        # would otherwise empty before it is read.
        new_values = list(value)
        values = self.value
        values.clear()
        values.update(new_values)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1582

1583

class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the list up front so the result is a snapshot rather
        # than a live walk over the group.
        checked_values = [
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib]
        return iter(checked_values)

    def add(self, value):
        box = next(
            (el for el in self.group if el.get('value') == value), None)
        if box is None:
            raise KeyError("No checkbox with value %r" % value)
        box.set('checked', '')

    def remove(self, value):
        box = next(
            (el for el in self.group if el.get('value') == value), None)
        if box is None:
            raise KeyError(
                "No checkbox with value %r" % value)
        if 'checked' not in box.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del box.attrib['checked']

    def __repr__(self):
        return '<%s {%s} for checkboxes name=%r>' % (
            type(self).__name__,
            ', '.join(repr(v) for v in self),
            self.group.name)

1625

1626

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio with no (or an empty) value
        reports ``'on'``; an unchecked one reports None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only a string replaces the ``value`` attribute; any other
            # true value just checks the input.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased and defaulting to ``'text'``.
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; on any other
        type an ``AttributeError`` is raised.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement

1722

1723

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Reading returns None when the label has no (or an empty)
        ``for`` attribute; otherwise the target is looked up by id
        below ``self.body``.  Assigning an element stores that
        element's ``id`` in the label's ``for`` attribute and raises
        ``TypeError`` if the element has no id.  Deleting removes the
        ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens when no element has this id depends
        # on get_element_by_id(), which is defined outside this block.
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the ``for`` attribute (see getter/setter);
        # the label's own ``id`` must not be touched here.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

1758

1759

1760############################################################

1761## Serialization

1762############################################################

1763

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be a tree (anything with ``getroot()``) or an element.
    The tree is modified in place; tags that already carry a namespace
    are left alone.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        root = html
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        plain_tag = node.tag
        if plain_tag[0] != '{':
            node.tag = namespace_prefix + plain_tag

1777

1778

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be a tree (anything with ``getroot()``) or an
    element.  The tree is modified in place.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        root = xhtml
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    strip = len(namespace_prefix)
    # Only elements inside the XHTML namespace are matched.
    for node in root.iter(namespace_prefix + "*"):
        node.tag = node.tag[strip:]

1791

1792

# Not a general matcher: this only targets the exact form of the
# Content-Type meta tag that libxml2 itself serialises.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'
).sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')
).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail,
        doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type meta tag with the helper that matches the
    # type (text or bytes) of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

1876

1877

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    The file is written with ``encoding`` if given, else with the
    document's own encoding, else as UTF-8.  The ``file://`` URL of
    the temporary file is printed before the browser is launched.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as f:
        output_encoding = encoding or doc.docinfo.encoding or "UTF-8"
        doc.write(f, method="html", encoding=output_encoding)
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1899

1900

1901################################################################################

1902# configure Element class lookup

1903################################################################################

1904

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        class_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(class_lookup)

1912

1913

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.

    All keyword arguments are passed on to ``etree.XMLParser``.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        class_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(class_lookup)

1934

1935

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    forwarded to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)


# Shared default parser instances used by this module.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/lxml/html/init.py: 4%

985 statements