Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/lxml/html/__init_

3# Redistribution and use in source and binary forms, with or without

4# modification, are permitted provided that the following conditions are

5# met:

7# 1. Redistributions of source code must retain the above copyright

8# notice, this list of conditions and the following disclaimer.

10# 2. Redistributions in binary form must reproduce the above copyright

11# notice, this list of conditions and the following disclaimer in

12# the documentation and/or other materials provided with the

13# distribution.

14#

15# 3. Neither the name of Ian Bicking nor the names of its contributors may

16# be used to endorse or promote products derived from this software

17# without specific prior written permission.

18#

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR

23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

31"""The ``lxml.html`` tool set for HTML handling.

32"""

35__all__ = [

36 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',

37 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',

38 'find_rel_links', 'find_class', 'make_links_absolute',

39 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

42import copy

43import re

45from collections.abc import MutableMapping, MutableSet

46from functools import partial

47from urllib.parse import urljoin

49from .. import etree

50from . import defs

51from ._setmixin import SetMixin

def __fix_docstring(s):
    """Strip the legacy ``u`` prefix from string reprs in a docstring.

    Every line that starts (after optional whitespace) with ``u'`` is
    rewritten to start with ``'`` instead.  Falsy input (``None`` or an
    empty string) is returned unchanged.
    """
    # TODO: remove and clean up doctests
    if not s:
        return s
    return re.sub(r"^(\s*)u'", r"\1'", s, flags=re.M)

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each element lookup matches both the
# plain HTML tag and its XHTML-namespaced twin (prefix ``x``).
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Whole-word match of $class_name inside the space-separated @class value.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# Plain ``str`` results (no smart strings) for text_content().
_collect_string_content = etree.XPath("string()", smart_strings=False)

# Finds CSS ``url(...)`` references (case-insensitively).  Group 1 is the
# argument exactly as written, i.e. still including any surrounding quotes.
# The parentheses around the argument are literal and must be escaped.
_iter_css_urls = re.compile(
    r'url\(('
    r'["][^"]*["]|'
    r"['][^']*[']|"
    r'[^)]*)\)',
    re.I).finditer
# Finds ``@import "..."`` statements; group 1 is the unquoted target.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer

# Finds <label> elements (HTML or XHTML) whose ``for`` attribute equals $id.
_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces={'x':XHTML_NAMESPACE})

# One whitespace-free token of a space-separated ``archive`` attribute.
_archive_re = re.compile(r'[^ ]+')
# Extracts the target of a meta-refresh ``content`` value such as
# "5; url=http://example.com/"; the "url=" part is optional.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  If ``s`` starts and ends with the same
    quote character (``"`` or ``'``), the quotes are removed and ``pos`` is
    advanced by one to skip the opening quote; otherwise both values are
    returned unchanged.
    """
    quote = s[:1]
    if quote in ('"', "'") and s[-1:] == quote:
        return s[1:-1], pos+1
    return s, pos

def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is the type of the caller's original input: for ``bytes`` the
    result is serialised as UTF-8, for ``str`` as a unicode string, and for
    anything else ``result`` is handed back untouched.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, str):
        return tostring(result, encoding='unicode')
    return result

100

def _nons(tag):
    """Return ``tag`` without its XHTML namespace prefix.

    Only string tags that start with ``{`` followed by the XHTML namespace
    are shortened to their local name.  Everything else, including
    non-string tags and the empty string, is returned unchanged.
    """
    # startswith() instead of tag[0] so that an empty tag cannot raise.
    if isinstance(tag, str) and tag.startswith('{' + XHTML_NAMESPACE):
        return tag.split('}')[-1]
    return tag

106

107

class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # ``attributes`` is any mapping with get/set/del item support; the
        # 'class' entry is re-read on every access, never cached.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_class_name(value):
        """Raise ValueError for an empty name or one containing whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        """
        self._check_class_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        """
        self._check_class_name(value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            # Nothing left: drop the attribute instead of leaving class="".
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        """
        self._check_class_name(value)
        super().remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # Cheap substring test first; split() only confirms a whole-name hit.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._check_class_name(value)
        classes = self._get_class_value().split()
        try:
            classes.remove(value)
            enabled = False
        except ValueError:
            classes.append(value)
            enabled = True
        if classes:
            self._attributes['class'] = ' '.join(classes)
        else:
            # Only reachable after a removal, so the attribute exists.
            del self._attributes['class']
        return enabled

214

215

class HtmlMixin:
    """HTML-specific helpers that are mixed into the element classes."""

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        super().set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.  Returns None if there is none.
        """
        return next(
            self.getroottree().iter("body", f"{{{XHTML_NAMESPACE}}}body"),
            None)

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.  Returns None if there is none.
        """
        return next(
            self.getroottree().iter("head", f"{{{XHTML_NAMESPACE}}}head"),
            None)

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, str):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows our last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element in the parent by its own children.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        Any other value raises ValueError before the document is touched.

        Raises TypeError if neither ``base_url`` nor the document's own
        ``.base_url`` is available.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Build (and thereby validate) the replacement function first, so
        # that a bad ``handle_failures`` value cannot leave the document
        # half-modified.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # Apply the same failure policy to the <base href> pass.
            self.resolve_base_href(handle_failures=handle_failures)

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                # Skip a <param> without a value: there is no link to report,
                # and a None link would break rewrite_links().
                if valuetype.lower() == 'ref' and 'value' in attribs:
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # The link sits inside the element text (e.g. <style>).
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

623

624

class _MethodFunc:
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # ``copy`` is the default for the per-call ``copy=`` keyword.
        self.name = name
        self.copy = copy
        # Expose the wrapped method's documentation on the function object.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, (str, bytes)):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                # Work on a deep copy so the caller's tree stays untouched.
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result

659

660

# Function forms of the HtmlMixin methods.  The wrappers for methods that
# modify the document default to working on a copy.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

667

668

class HtmlComment(HtmlMixin, etree.CommentBase):
    """Comment node class that carries the HtmlMixin helpers."""

671

672

class HtmlElement(HtmlMixin, etree.ElementBase):
    """Element class that carries the HtmlMixin helpers."""

675

676

class HtmlProcessingInstruction(HtmlMixin, etree.PIBase):
    """Processing-instruction node class that carries the HtmlMixin helpers."""

679

680

class HtmlEntity(HtmlMixin, etree.EntityBase):
    """Entity node class that carries the HtmlMixin helpers."""

683

684

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Accept the documented mapping form as well as a sequence of
            # (name, mixin) pairs.
            pairs = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in pairs:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they win in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

725

726

727################################################################################

728# parsing

729################################################################################

730

# Detect input that starts (after optional whitespace) with an <html> tag
# or a doctype, for str and bytes input respectively.
_looks_like_full_html_unicode = re.compile(
    r'^\s*<(?:html|!doctype)', re.I).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', re.I).match

735

736

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a whole document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``<head>`` is inserted as first
    child and a missing ``<body>`` is appended to the root.

    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    value = etree.fromstring(html, parser, **kw)
    if value is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if value.find('head') is None:
            value.insert(0, Element('head'))
        if value.find('body') is None:
            value.append(Element('body'))
    return value

749

750

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Wrap bare fragments in a minimal document so that the result always
    # has an <html>/<body> structure to pick the elements from.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = (b'<html><body>' + html +
                    b'</body></html>')
    else:
        if not _looks_like_full_html_unicode(html):
            html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    has_leading_text = bool(body.text and body.text.strip())
    if no_leading_text and has_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if has_leading_text:
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

788

789

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text can only be kept if there is a parent to attach it to.
    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not accept_leading_text,
        base_url=base_url, **kw)

    if create_parent:
        if not isinstance(create_parent, str):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], str):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Only whitespace can be left in the tail here; drop it.
    el.tail = None
    return el

837

838

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

905

906

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    if parser is None:
        parser = html_parser
    return etree.parse(filename_or_url, parser, base_url=base_url, **kw)

919

920

def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it is a block-level tag.

    Tag names are compared against ``defs.block_tags`` after stripping the
    XHTML namespace.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(_nons(child.tag) in defs.block_tags
               for child in el.iter(etree.Element))

928

929

def _element_name(el):
    """Return a short display name for a parsed item, for error messages.

    Comments give ``'comment'``, plain strings give ``'string'``, and
    anything else gives its tag name without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, str):
        return 'string'
    return _nons(el.tag)

937

938

939################################################################################

940# form handling

941################################################################################

942

943class FormElement(HtmlElement):

944 """

945 Represents a <form> element.

946 """

947

@property
def inputs(self):
    """
    Returns an accessor for all the input elements in the form.

    See `InputGetter` for more information about the object.
    """
    return InputGetter(self)

956

957 @property

958 def fields(self):

959 """

960 Dictionary-like object that represents all the fields in this

961 form. You can set values in this dictionary to effect the

962 form.

963 """

964 return FieldsDict(self.inputs)

965

966 @fields.setter

967 def fields(self, value):

968 fields = self.fields

969 prev_keys = fields.keys()

970 for key, value in value.items():

971 if key in prev_keys:

972 prev_keys.remove(key)

973 fields[key] = value

974 for key in prev_keys:

975 if key is None:

976 # Case of an unnamed input; these aren't really

977 # expressed in form_values() anyway.

978 continue

979 fields[key] = None

980

981 def _name(self):

982 if self.get('name'):

983 return self.get('name')

984 elif self.get('id'):

985 return '#' + self.get('id')

986 iter_tags = self.body.iter

987 forms = list(iter_tags('form'))

988 if not forms:

989 forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))

990 return str(forms.index(self))

991

992 def form_values(self):

993 """

994 Return a list of tuples of the field values for the form.

995 This is suitable to be passed to ``urllib.urlencode()``.

996 """

997 results = []

998 for el in self.inputs:

999 name = el.name

1000 if not name or 'disabled' in el.attrib:

1001 continue

1002 tag = _nons(el.tag)

1003 if tag == 'textarea':

1004 results.append((name, el.value))

1005 elif tag == 'select':

1006 value = el.value

1007 if el.multiple:

1008 for v in value:

1009 results.append((name, v))

1010 elif value is not None:

1011 results.append((name, el.value))

1012 else:

1013 assert tag == 'input', (

1014 "Unexpected tag: %r" % el)

1015 if el.checkable and not el.checked:

1016 continue

1017 if el.type in ('submit', 'image', 'reset', 'file'):

1018 continue

1019 value = el.value

1020 if value is not None:

1021 results.append((name, el.value))

1022 return results

1023

1024 @property

1025 def action(self):

1026 """

1027 Get/set the form's ``action`` attribute.

1028 """

1029 base_url = self.base_url

1030 action = self.get('action')

1031 if base_url and action is not None:

1032 return urljoin(base_url, action)

1033 else:

1034 return action

1035

1036 @action.setter

1037 def action(self, value):

1038 self.set('action', value)

1039

1040 @action.deleter

1041 def action(self):

1042 attrib = self.attrib

1043 if 'action' in attrib:

1044 del attrib['action']

1045

1046 @property

1047 def method(self):

1048 """

1049 Get/set the form's method. Always returns a capitalized

1050 string, and defaults to ``'GET'``

1051 """

1052 return self.get('method', 'GET').upper()

1053

1054 @method.setter

1055 def method(self, value):

1056 self.set('method', value.upper())

1057

1058

1059HtmlElementClassLookup._default_element_classes['form'] = FormElement

1060

1061

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener (``open_http_urllib``) that is the
    file-like object produced by ``urlopen()``, which also has a
    ``.geturl()`` function showing the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping (anything with ``.items()``) or a
    sequence of ``(name, value)`` tuples; it is appended to the values
    collected from ``form.form_values()``.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    form's action (or its base URL when the action is empty) as a string,
    and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        extras = (extra_values.items()
                  if hasattr(extra_values, 'items') else extra_values)
        values.extend(extras)
    opener = open_http_urllib if open_http is None else open_http
    url = form.action or form.base_url
    return opener(form.method, url, values)

1098

1099

def open_http_urllib(method, url, values):
    """
    Default opener used by ``submit_form()``.

    For ``'GET'`` the URL-encoded ``values`` are appended to ``url`` as a
    query string (joined with ``&`` if the URL already contains ``?``) and
    no request body is sent.  For any other method the encoded values are
    sent as an ASCII-encoded request body.

    :param method: the form method, e.g. ``'GET'`` or ``'POST'``.
    :param url: the target URL.
    :param values: sequence of ``(name, value)`` tuples (anything accepted
        by ``urlencode``).
    :return: the result of ``urllib.request.urlopen()``.
    :raises ValueError: if ``url`` is empty or ``None``.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    # Python 3 locations only; the old ``from urllib import ...`` fallback
    # could never succeed on the Python versions this module supports.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        url += separator + encoded
        data = None
    else:
        # urlencode() returns str; the request body must be bytes.
        data = encoded.encode('ASCII')
    return urlopen(url, data)

1121

1122

class FieldsDict(MutableMapping):
    """
    Mapping view over the fields of a form.

    Wraps an inputs accessor (see ``FormElement.inputs``): reading a key
    returns the ``.value`` of ``inputs[key]`` and assigning a key sets it.
    Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # Fields belong to the underlying form; removing them through this
        # view is not supported.
        raise KeyError(
            "You cannot remove keys from FieldsDict")

    def keys(self):
        # Delegates to the accessor, so the result type is whatever
        # ``inputs.keys()`` returns.
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())

1147

1148

class InputGetter:

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If several radio inputs (or several
    checkboxes) share the same name, they are returned together as a
    `RadioGroup` (or `CheckboxGroup`), which also allows value setting.
    Use ``.keys()`` and ``.items()`` to process all fields in this way.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (type(self).__name__, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = [field for field in self if field.name == name]
        if not matches:
            raise KeyError("No input element with the name %r" % name)

        first = matches[0]
        # The type of the first match decides how duplicates are grouped.
        input_type = first.get('type')
        if len(matches) > 1:
            group = None
            if input_type == 'radio':
                group = RadioGroup(matches)
            elif input_type == 'checkbox':
                group = CheckboxGroup(matches)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return any(field.name == name for field in self)

    def keys(self):
        """
        Returns all unique field names, in document order.

        Unnamed fields (name ``None``) are left out.

        :return: A list of all unique field names.
        """
        # dict.fromkeys() keeps first-seen order while dropping duplicates.
        ordered = dict.fromkeys(el.name for el in self)
        ordered.pop(None, None)
        return list(ordered)

    def items(self):
        """
        Returns all fields with their names, similar to dict.items().

        Unlike ``keys()``, a ``None`` name is not filtered out here.

        :return: A list of (name, field) tuples.
        """
        unique_names = dict.fromkeys(el.name for el in self)
        return [(name, self[name]) for name in unique_names]

    def __iter__(self):
        return self.form.iter('select', 'input', 'textarea')

    def __len__(self):
        count = 0
        for _ in self:
            count += 1
        return count

1236

1237

class InputMixin:
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        Deleting it removes the attribute if present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only mention the type when the element exposes a non-empty one.
        type_name = getattr(self, 'type', None)
        type_part = (' type=%r' % type_name) if type_name else ''
        return '<%s %x name=%r%s>' % (
            type(self).__name__, id(self), self.name, type_part)

1267

1268

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        The getter returns the element's text followed by the serialised
        form of any child elements; children are serialised as XML when
        the tag is in the XHTML namespace and as HTML otherwise.
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        # Child elements are rare here, so this list is normally one item.
        pieces = [self.text or '']
        for child in self:
            pieces.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop all children first, then replace the text content.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


# Register TextareaElement as the element class for 'textarea' tags.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

1302

1303

def _select_option_value(option):
    """Return the option's ``value`` attribute, or its stripped text when
    the attribute is absent."""
    option_value = option.get('value')
    if option_value is None:
        option_value = (option.text or '').strip()
    return option_value


class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select the last option carrying a ``selected``
        attribute wins; if none is selected, the first option without a
        ``disabled`` attribute is used; if there is none, the value is
        ``None``.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = next(
            (el for el in reversed(options) if el.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (el for el in options if el.get('disabled') is None), None)
            if chosen is None:
                return None
        return _select_option_value(chosen)

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, str):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        options = _options_xpath(self)
        target = None
        if value is not None:
            target = next(
                (el for el in options if _select_option_value(el) == value),
                None)
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Only touch the document once the requested value is known to exist.
        for el in options:
            if 'selected' in el.attrib:
                del el.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the stripped
        option text where that attribute is missing).
        """
        return [_select_option_value(el) for el in _options_xpath(self)]

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']


# Register SelectElement as the element class for 'select' tags.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement

1405

1406

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _option_value(option):
        """Return the option's ``value`` attribute, or its stripped text
        when the attribute is absent."""
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = (option.text or '').strip()
        return opt_value

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        """Yield the value of every option carrying a ``selected`` attribute."""
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        :raises ValueError: if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        :raises ValueError: if no option has that value, or if that option
            is not currently selected.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1466

1467

class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        The getter returns the ``value`` attribute of the first radio
        carrying a ``checked`` attribute, or ``None`` if none is checked.
        Setting ``None`` (or deleting) unchecks every radio.
        """
        checked = [el for el in self if 'checked' in el.attrib]
        if checked:
            return checked[0].get('value')
        return None

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            target = next(
                (el for el in self if el.get('value') == value), None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        # Uncheck everything, then check the requested radio (if any).
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, list.__repr__(self))

1519

1520

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable replaces the checked set with those values;
        assigning a non-iterable raises ``ValueError`` and leaves the
        checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before mutating, so a bad argument cannot wipe the
        # current selection.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1563

1564

class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Take a snapshot of the checked values before iterating.
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox whose ``value`` attribute equals
        ``value``; raise ``KeyError`` if there is none."""
        target = next(
            (el for el in self.group if el.get('value') == value), None)
        if target is None:
            raise KeyError("No checkbox with value %r" % value)
        target.set('checked', '')

    def remove(self, value):
        """Uncheck the first checkbox whose ``value`` attribute equals
        ``value``; raise ``KeyError`` if there is none or if it is already
        unchecked."""
        target = next(
            (el for el in self.group if el.get('value') == value), None)
        if target is None:
            raise KeyError(
                "No checkbox with value %r" % value)
        if 'checked' not in target.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del target.attrib['checked']

    def __repr__(self):
        rendered = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            type(self).__name__, rendered, self.group.name)

1606

1607

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        For a checkable input (checkbox or radio) that is checked, a
        missing or empty ``value`` attribute is reported as ``'on'``; a
        checkable input that is not checked returns None.

        Setting a checkable input checks or unchecks it according to the
        truthiness of the new value, and stores the value in the
        attribute only when it is a non-empty string.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
        elif value:
            self.checked = True
            if isinstance(value, str):
                self.set('value', value)
        else:
            self.checked = False

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased, with ``'text'`` as the default.
        """
        return self.get('type', 'text').lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; otherwise an
        ``AttributeError`` is raised.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


# Register InputElement as the element class for 'input' tags.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

1703

1704

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.

        The getter returns None when the label has no (or an empty)
        ``for`` attribute; otherwise it returns the result of
        ``self.body.get_element_by_id()`` for that id.
        NOTE(review): behaviour for an id that matches no element depends
        on ``get_element_by_id`` -- confirm whether it raises.

        Setting stores the ``id`` of the given element in this label's
        ``for`` attribute and raises ``TypeError`` if that element has no
        id.  Deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link is stored in ``for`` (see getter/setter above), so that
        # is what must be removed -- the label's own ``id`` stays intact.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


# Register LabelElement as the element class for 'label' tags.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

1739

1740

############################################################
## Serialization
############################################################

1744

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    Accepts either a tree (anything with ``getroot()``) or an element.
    The tree is modified in place; tags that already start with ``{``
    (i.e. already namespaced) are left alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in html.iter(etree.Element):
        current_tag = element.tag
        if current_tag[0] == '{':
            continue
        element.tag = xhtml_prefix + current_tag

1758

1759

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    Accepts either a tree (anything with ``getroot()``) or an element.
    The tree is modified in place; only elements in the XHTML namespace
    are visited.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip_from = len(xhtml_prefix)
    for element in xhtml.iter(xhtml_prefix + "*"):
        namespaced_tag = element.tag
        element.tag = namespaced_tag[strip_from:]

1772

1773

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    br'<meta http-equiv="Content-Type"[^>]*>').sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Serialisation is delegated to ``etree.tostring()``.  When ``method``
    is ``'html'`` and ``include_meta_content_type`` is false, every
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    serialised output; otherwise the output is returned as produced.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail, doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # The result is str or bytes depending on the encoding; use the
    # matching substitution helper.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

1857

1858

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    :param doc: an element or an ElementTree; a bare element is wrapped
        in an ``etree.ElementTree`` first.
    :param encoding: output encoding; falls back to the document's own
        encoding and then to ``"UTF-8"``.

    The ``file:`` URL of the temporary file is printed before the browser
    is launched.
    """
    import os
    import pathlib
    import tempfile
    import webbrowser
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but the with-block at least closes it
    with os.fdopen(handle, 'wb') as f:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    # as_uri() builds a well-formed file URL on every platform (including
    # Windows drive-letter paths, which need the 'file:///C:/...' form).
    url = pathlib.Path(fn).as_uri()
    print(url)
    webbrowser.open(url)

1880

1881

################################################################################
# configure Element class lookup
################################################################################

1885

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are forwarded to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Install a fresh lookup so parsed nodes use the classes
        # registered with HtmlElementClassLookup.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1893

1894

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are forwarded to ``etree.XMLParser``.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Install a fresh lookup so parsed nodes use the classes
        # registered with HtmlElementClassLookup.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1915

1916

def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are passed to ``html_parser.makeelement()``.

    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)


# Module-level default parser instances; ``html_parser`` is the fallback
# used by ``parse()`` and ``Element()``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/lxml/html/init.py: 3%

972 statements