Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/lxml/html/__init_

3# Redistribution and use in source and binary forms, with or without

4# modification, are permitted provided that the following conditions are

5# met:

7# 1. Redistributions of source code must retain the above copyright

8# notice, this list of conditions and the following disclaimer.

10# 2. Redistributions in binary form must reproduce the above copyright

11# notice, this list of conditions and the following disclaimer in

12# the documentation and/or other materials provided with the

13# distribution.

14#

15# 3. Neither the name of Ian Bicking nor the names of its contributors may

16# be used to endorse or promote products derived from this software

17# without specific prior written permission.

18#

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR

23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

31"""The ``lxml.html`` tool set for HTML handling.

32"""

# Names exported by ``from lxml.html import *``.
__all__ = [
    'document_fromstring',
    'fragment_fromstring',
    'fragments_fromstring',
    'fromstring',
    'tostring',
    'Element',
    'defs',
    'open_in_browser',
    'submit_form',
    'find_rel_links',
    'find_class',
    'make_links_absolute',
    'resolve_base_href',
    'iterlinks',
    'rewrite_links',
    'parse',
]

42import copy

43import re

45from collections.abc import MutableMapping, MutableSet

46from functools import partial

47from urllib.parse import urljoin

49from .. import etree

50from . import defs

51from ._setmixin import SetMixin

def __fix_docstring(s):
    """Drop the ``u`` prefix from ``u'...'`` literals that start a line.

    Only occurrences preceded by nothing but whitespace on their line are
    rewritten.  Falsy input (``None`` or an empty string) is returned as is.
    """
    # TODO: remove and clean up doctests
    if s:
        return re.sub(r"^(\s*)u'", r"\1'", s, flags=re.M)
    return s

# Namespace URI of XHTML.  The expressions below match a tag both without
# a namespace and inside this namespace (bound to the prefix ``x``).
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements that carry a ``rel`` attribute (used by find_rel_links()).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements at or below the context node (used by the ``forms`` property).
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-normalised ``class`` attribute contains
# $class_name as a complete, space-delimited word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node (used by text_content()).
_collect_string_content = etree.XPath("string()")

# Iterates over CSS ``url(...)`` references (case-insensitive).  Group 1
# is the argument exactly as written, i.e. still including surrounding
# double or single quotes if present.  The parentheses of ``url(`` and the
# closing ``)`` are literal and therefore escaped.
_iter_css_urls = re.compile(
    r'url\(('
    r'["][^"]*["]|'
    r"['][^']*[']|"
    r'[^)]*)\)',
    re.I).finditer
# Iterates over ``@import "..."`` rules; group 1 is the quoted target
# without its double quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer

# <label> elements (plain or XHTML) anywhere in the document whose ``for``
# attribute equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One entry of a space-separated list (applied to <object archive="...">).
_archive_re = re.compile(r'[^ ]+')
# Searches a <meta http-equiv="refresh"> ``content`` value for its target:
# the named group ``url`` is everything after the first ';' and an
# optional, case-insensitive "url=" prefix.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s`` and adjust ``pos``.

    If ``s`` starts and ends with the same quote character (``"`` or
    ``'``), return the text between the quotes and ``pos + 1`` (the
    position just after the opening quote).  Otherwise return ``s`` and
    ``pos`` unchanged.
    """
    first, last = s[:1], s[-1:]
    is_quoted = first == last and first in ('"', "'")
    if not is_quoted:
        return s, pos
    return s[1:-1], pos + 1

def _transform_result(typ, result):
    """Convert the result back into the input type.

    When ``typ`` is a ``bytes`` subclass the result is serialised with
    ``tostring(..., encoding='utf-8')``; when it is a ``str`` subclass,
    with ``encoding='unicode'``.  For any other type the result is
    returned untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, str):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)

100

def _nons(tag):
    """Return ``tag`` without its XHTML namespace qualifier.

    ``'{http://www.w3.org/1999/xhtml}div'`` becomes ``'div'``.  Tags in no
    namespace or in any other namespace, and non-string tags (as found on
    comments and similar nodes), are returned unchanged.
    """
    if isinstance(tag, str):
        # ``tag[:1]`` instead of ``tag[0]`` so an empty string does not
        # raise IndexError.  The closing brace is part of the comparison so
        # that a namespace merely *starting* with the XHTML URI is left alone.
        if tag[:1] == '{' and tag.startswith(XHTML_NAMESPACE + '}', 1):
            return tag[len(XHTML_NAMESPACE) + 2:]
    return tag

106

107

class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # ``attributes`` is the mapping that holds the 'class' entry; every
        # operation re-reads it, so no state is cached here.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _require_valid_name(value):
        # A class name must be non-empty and free of whitespace.
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        """
        self._require_valid_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        """
        self._require_valid_name(value)
        remaining = [
            name for name in self._get_class_value().split() if name != value
        ]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            # nothing left: drop the attribute instead of leaving it empty
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        """
        self._require_valid_name(value)
        super().remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        # cheap substring test first, exact word match second
        if name not in raw:
            return False
        return name in raw.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        initial_count = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # only write the attribute back when something was actually added
        if len(names) != initial_count:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._require_valid_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled

214

215

class HtmlMixin:
    """Mixin with the HTML-specific API shared by the node classes of this module."""

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super().set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The association is made through the element's ``id``: the getter
        returns the first <label> whose ``for`` attribute equals it, or
        None if the element has no id or no such label exists.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, str):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # the tail follows the last child once the tag is gone
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # replace this element in its parent by its own children
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison with ``rel`` is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Build the replacement function first so that an invalid
        # ``handle_failures`` is rejected before the tree is modified.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # apply the same failure policy while resolving <base href>
            self.resolve_base_href(handle_failures=handle_failures)

        # <base href> has either been resolved above or must be ignored;
        # rewrite_links() would otherwise resolve it by default.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # every <base href> tag is removed; the href of the last one wins
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # link inside the element text: splice at its position
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

619

620

class _MethodFunc:
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # ``copy`` is the default for whether element inputs are
        # deep-copied before the method runs; callers can override it per
        # call with the ``copy`` keyword.
        self.name = name
        self.copy = copy
        # expose the wrapped method's docstring on the function object
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, (str, bytes)):
            # string input is always parsed into a fresh tree
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)

655

656

# Module-level function forms of the HtmlMixin methods of the same name.
# ``copy`` sets the default for whether an element argument is deep-copied
# before the method is applied to it.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

663

664

class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment node class that also provides the ``HtmlMixin`` API."""

667

668

class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class that also provides the ``HtmlMixin`` API."""

671

672

class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing-instruction node class that also provides the ``HtmlMixin`` API."""

675

676

class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity node class that also provides the ``HtmlMixin`` API."""

679

680

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs; the latter allows several mixins
    for the same tag.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Work on a copy so the caller's mapping is not modified when
            # the mixed classes are stored below.
            classes = dict(classes)
            # Iterating a mapping directly would yield only its keys.
            pairs = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in pairs:
                if name == '*':
                    # '*' applies the mixin to every known element class
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # mixins come first so that they take precedence in the MRO
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are looked up by lower-cased tag name and fall back to
        ``HtmlElement``.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

721

722

723################################################################################

724# parsing

725################################################################################

726

# Match input that, after optional leading whitespace, starts with an
# ``<html`` or ``<!doctype`` tag (case-insensitive).  One matcher for str
# input and one for bytes input.
_looks_like_full_html_unicode = re.compile(
    r'^\s*<(?:html|!doctype)', re.I).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', re.I).match

731

732

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a whole document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``head`` child is inserted at
    the front of the root and a missing ``body`` child is appended.

    Raises ``etree.ParserError`` if parsing yields no document.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

745

746

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document is wrapped in an
    # artificial <html><body> so that it can be parsed as a document.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    has_leading_text = bool(body.text and body.text.strip())
    if no_leading_text and has_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    fragments = []
    if has_leading_text:
        fragments.append(body.text)
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments

784

785

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # leading text is only acceptable when a parent element will hold it
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        # any true non-string value selects the default wrapper tag
        parent_tag = create_parent if isinstance(create_parent, str) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], str):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(_element_name(e) for e in elements))
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % result.tail)
    result.tail = None
    return result

833

834

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    The return value depends on what the input turns out to contain:

    * the whole document if the input starts with ``<html`` or a doctype,
      if the parsed tree contains a <head>, or if it has no <body>;
    * the single child of <body> if that is all the body contains besides
      whitespace;
    * otherwise the <body> element itself, renamed to ``div`` if it contains
      a block-level tag and to ``span`` if not.
    """
    if parser is None:
        parser = html_parser
    # decide on the raw input, before the parser adds implied structure
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    # append the extra body's text after the content
                    # collected so far
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

901

902

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    effective_parser = html_parser if parser is None else parser
    return etree.parse(
        filename_or_url, effective_parser, base_url=base_url, **kw)

915

916

def _contains_block_level_tag(el):
    """Return True if any element yielded by ``el.iter(etree.Element)`` has
    a tag listed in ``defs.block_tags``, False otherwise."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element)
    )

924

925

def _element_name(el):
    """Return a short name for a parsed fragment item (used in error messages).

    Comments are reported as ``'comment'``, plain strings as ``'string'``,
    and anything else by its tag without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, str):
        return 'string'
    return _nons(el.tag)

933

934

935################################################################################

936# form handling

937################################################################################

938

class FormElement(HtmlElement):
    """
    Element class used for ``<form>`` tags.
    """

    @property
    def inputs(self):
        """
        An accessor object covering every input element of this form.

        The returned object is an `InputGetter`; see that class for details.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view (a `FieldsDict`) of all fields in this form.
        Assigning values in it changes the form.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        leftover = current.keys()
        for field_name, field_value in value.items():
            if field_name in leftover:
                leftover.remove(field_name)
            current[field_name] = field_value
        # Every field that ``value`` did not mention is reset to None.
        for field_name in leftover:
            if field_name is not None:
                current[field_name] = None
            # else: an unnamed input; these aren't really expressed in
            # form_values() anyway, so they are left alone.

    def _name(self):
        # Label used in reprs: the ``name`` attribute, else '#' + ``id``,
        # else the position of this form among the forms found under
        # ``self.body``.
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        walk = self.body.iter
        candidates = list(walk('form'))
        if not candidates:
            candidates = list(walk('{%s}form' % XHTML_NAMESPACE))
        return str(candidates.index(self))

    def form_values(self):
        """
        Collect the form's field values as a list of ``(name, value)``
        tuples, in a shape suitable for ``urllib.parse.urlencode()``.

        Inputs that have no name or carry a ``disabled`` attribute are
        skipped, as are unchecked checkboxes/radios and inputs of type
        submit, image, reset and file.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                pairs.append((name, el.value))
                continue
            if tag == 'select':
                selected = el.value
                if el.multiple:
                    # One tuple per selected option.
                    pairs.extend((name, item) for item in selected)
                elif selected is not None:
                    pairs.append((name, selected))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            current = el.value
            if current is not None:
                pairs.append((name, current))
        return pairs

    @property
    def action(self):
        """
        Get/set the ``action`` attribute of the form.

        On read, the attribute is joined onto ``self.base_url`` when both
        are available; otherwise the raw attribute (possibly None) is
        returned.
        """
        base = self.base_url
        target = self.get('action')
        if base and target is not None:
            return urljoin(base, target)
        return target

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Reading always yields an upper-case
        string and falls back to ``'GET'`` when the attribute is missing.
        """
        return self.get('method', 'GET').upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement

1056

1057

def submit_form(form, extra_values=None, open_http=None):
    """
    Submit ``form`` and return whatever the HTTP opener returns.

    With the default opener this is a file-like object, as from
    ``urllib.request.urlopen()``, which also offers ``.geturl()`` to show
    the final URL if there were any redirects.

    Typical use::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended after the form's own values.

    To use a different HTTP requester, pass a callable as ``open_http``.
    It is called as::

        open_http(method, URL, values)

    where the method is the form's method (e.g. 'GET' or 'POST'), the URL
    is the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        extras = (extra_values.items()
                  if hasattr(extra_values, 'items') else extra_values)
        values.extend(extras)
    opener = open_http_urllib if open_http is None else open_http
    # Fall back to the document's base URL when the form has no action.
    target = form.action or form.base_url
    return opener(form.method, target, values)

1094

1095

def open_http_urllib(method, url, values):
    """
    Default HTTP opener used by ``submit_form()``, based on ``urllib``.

    ``values`` is URL-encoded.  For ``'GET'`` the encoded data is appended
    to ``url`` as a query string (using ``&`` when the URL already contains
    a ``?``); for every other method it is sent as the ASCII-encoded
    request body.

    Returns the result of ``urllib.request.urlopen()``.
    Raises ``ValueError`` when ``url`` is empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    # Imported inside the function, as before, so that urllib.request is
    # only loaded when a form is actually submitted.
    from urllib.parse import urlencode
    from urllib.request import urlopen
    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    # urlencode() returns str; the request body has to be bytes.
    return urlopen(url, encoded.encode('ASCII'))

1117

1118

class FieldsDict(MutableMapping):
    """
    Mutable mapping from field name to field value, backed by an inputs
    accessor.  Values are read from and written to the ``.value`` of the
    object the accessor returns for a name; keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        return '<%s for form %s>' % (
            type(self).__name__, self.inputs.form._name())

1143

1144

class InputGetter:

    """
    Accessor for all the input fields of a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing a name come back as a list (a
    `CheckboxGroup`, which also allows value setting); radio inputs are
    treated the same way with a `RadioGroup`.  ``.keys()`` and
    ``.items()`` process the fields grouped like this.

    Iterating over the object yields the individual input elements
    instead, so checkboxes and radio elements show up one by one rather
    than as groups.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            type(self).__name__, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [field for field in self if field.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        kind = matches[0].get('type')
        if len(matches) > 1 and kind in ('radio', 'checkbox'):
            group_class = RadioGroup if kind == 'radio' else CheckboxGroup
            group = group_class(matches)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def keys(self):
        """
        Returns all unique field names, in document order.  Inputs
        without a name are left out.

        :return: A list of all unique field names.
        """
        # dict.fromkeys() keeps only the first occurrence of each name.
        unique = dict.fromkeys(el.name for el in self)
        unique.pop(None, None)
        return list(unique)

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().
        Unlike ``keys()``, an entry for the name ``None`` is included when
        the form has unnamed inputs.

        :return: A list of (name, field) tuples.
        """
        unique = dict.fromkeys(el.name for el in self)
        return [(name, self[name]) for name in unique]

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count

1232

1233

class InputMixin:
    """
    Shared behaviour for the input element classes (input, select, and
    textarea).
    """
    @property
    def name(self):
        """
        Get/set the ``name`` attribute of the element.  Deleting it removes
        the attribute if present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only classes that define ``type`` get a ``type=...`` part.
        kind = getattr(self, 'type', None)
        type_part = ' type=%r' % kind if kind else ''
        return '<%s %x name=%r%s>' % (
            type(self).__name__, id(self), self.name, type_part)

1263

1264

class TextareaElement(InputMixin, HtmlElement):
    """
    Element class for ``<textarea>`` tags.  ``.name`` gives the name and
    ``.value`` reads or writes the value.
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        Reading returns the element's text followed by the serialisation of
        any child elements; children are serialised as XML when the tag is
        in the XHTML namespace and as HTML otherwise.
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        pieces = [self.text or '']
        # Child elements are rare here; each one is appended as markup.
        pieces.extend(
            etree.tostring(
                child, method=serialisation_method, encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop all children, then store the new value as plain text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

1298

1299

class SelectElement(InputMixin, HtmlElement):
    """
    Element class for ``<select>`` tags.  ``.name`` gives the name.

    ``.value`` is the value of the selected option, except for a
    multi-select element (``<select multiple>``), where it is a set-like
    object.  ``.value_options`` lists the possible values in both cases.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        For a multi-select, this is a set-like object that represents all
        the selected options.

        Otherwise the last option carrying ``selected`` wins; without one,
        the first option that is not ``disabled`` is used, and None is
        returned when there is no such option.  An option without a
        ``value`` attribute contributes its stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        # Compare against None explicitly rather than relying on the
        # truthiness of an element.
        chosen = next(
            (el for el in reversed(options) if el.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (el for el in options if el.get('disabled') is None), None)
        if chosen is None:
            return None
        result = chosen.get('value')
        if result is None:
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, str):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        target = None
        if value is not None:
            # Find the first option whose value equals the requested one.
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = (option.text or '').strip()
                if candidate == value:
                    target = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then select the match (if any).
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have: the ``value``
        attribute of each ``<option>`` element, or its stripped text when
        the attribute is missing.
        """
        collected = []
        for option in _options_xpath(self):
            attr_value = option.get('value')
            collected.append(
                (option.text or '').strip()
                if attr_value is None else attr_value)
        return collected

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

1401

1402

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # Yield the value of each selected option; an option without a
        # ``value`` attribute contributes its stripped text.
        for option in self.options:
            if 'selected' in option.attrib:
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = (option.text or '').strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ``ValueError`` when no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ``ValueError`` when that option is not currently selected,
        or when no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Message wording now matches add() ("no option", not "not option").
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1462

1463

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides behaving like a list, the group has a ``.value`` property to
    check/uncheck inputs and a ``.value_options`` property listing the
    possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).  Reading returns the ``value`` of the
        first checked radio, or None when none is checked.
        """
        return next(
            (el.get('value') for el in self if 'checked' in el.attrib),
            None)

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            matching = [el for el in self if el.get('value') == value]
            if not matching:
                raise ValueError("There is no radio input with the value %r" % value)
            target = matching[0]
        # Uncheck everything first, then check the requested radio (if any).
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            type(self).__name__, list.__repr__(self))

1515

1516

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection;
        assigning something that is not iterable raises ``ValueError`` and
        leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before touching anything, so that a rejected assignment
        # does not uncheck the whole group as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1559

1560

class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes that share one name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Take a snapshot first, so the iteration is not affected by
        # checkboxes being (un)checked while it is consumed.
        checked_values = [
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib]
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox with this value; KeyError if none has it.
        for checkbox in self.group:
            if checkbox.get('value') == value:
                checkbox.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox with this value; KeyError if it is
        # already unchecked or no checkbox has the value.
        for checkbox in self.group:
            if checkbox.get('value') != value:
                continue
            if 'checked' not in checkbox.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del checkbox.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            type(self).__name__, shown, self.group.name)

1602

1603

class InputElement(InputMixin, HtmlElement):
    """
    Element class for ``<input>`` tags.

    ``.type`` gives the input type (lower-cased, ``'text'`` when the
    attribute is absent) and ``.value`` reads or writes the value.

    For checkboxes and radios ``input.checkable == True`` (it is false for
    all other types), and they have a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reads as ``'on'``; an
        unchecked one reads as None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truthiness of ``value`` drives the checked
        # state; only a (non-empty) string also replaces the value attribute.
        self.checked = bool(value)
        if value and isinstance(value, str):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased and defaulting to ``'text'``.
        """
        return self.get('type', 'text').lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; other types raise
        ``AttributeError``.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement

1699

1700

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.

        Reading returns None when the label has no ``for`` attribute;
        otherwise the result of ``self.body.get_element_by_id()`` for that
        id.  Setting requires the other element to have an ``id`` and
        raises ``TypeError`` if it has none.  Deleting removes the label's
        ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the ``for`` attribute (see getter and setter);
        # the label's own ``id`` must not be touched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

1735

1736

1737############################################################

1738## Serialization

1739############################################################

1740

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, converting the tree to XHTML in place.

    ``html`` may be a tree (anything with ``getroot()``) or an element.
    """
    try:
        root = html.getroot()
    except AttributeError:
        root = html
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        # Tags that already carry a namespace are left untouched.
        if node.tag[0] == '{':
            continue
        node.tag = xhtml_prefix + node.tag

1754

1755

def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    converting the tree to HTML in place.

    ``xhtml`` may be a tree (anything with ``getroot()``) or an element.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        root = xhtml
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip = len(xhtml_prefix)
    # Only elements in the XHTML namespace are visited.
    for node in root.iter(xhtml_prefix + "*"):
        node.tag = node.tag[strip:]

1768

1769

# Not a general-purpose pattern: it only has to match the meta tag exactly
# as libxml2 serialises it.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail, doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type meta tag, using the pattern that matches the
    # type (str or bytes) of the serialised output.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

1853

1854

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an element tree.  The file is written
    with ``encoding`` if given, else the document's own encoding, else
    UTF-8.  The file URL is also printed.
    """
    import os
    import pathlib
    import tempfile
    import webbrowser
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # We leak the file itself here, but the handle is always closed.
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    # as_uri() builds a well-formed file URL (drive letters, escaping)
    # instead of gluing 'file://' onto the path by hand.
    url = pathlib.Path(fn).as_uri()
    print(url)
    webbrowser.open(url)

1876

1877

1878################################################################################

1879# configure Element class lookup

1880################################################################################

1881

class HTMLParser(etree.HTMLParser):
    """An ``etree.HTMLParser`` set up with ``HtmlElementClassLookup``, so
    that it returns lxml.html Element objects.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1889

1890

class XHTMLParser(etree.XMLParser):
    """An ``etree.XMLParser`` set up with ``HtmlElementClassLookup``, so
    that it returns lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1911

1912

def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are passed to ``html_parser.makeelement()``.
    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)


# Module-level default parser instances.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/lxml/html/__init__.py: 2%

968 statements