Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/genshi/core.py: 57%

1# -*- coding: utf-8 -*-

6# This software is licensed as described in the file COPYING, which

7# you should have received as part of this distribution. The terms

8# are also available at http://genshi.edgewall.org/wiki/License.

10# This software consists of voluntary contributions made by many

11# individuals. For the exact contribution history, see the revision

12# history and logs, available at http://genshi.edgewall.org/log/.

14"""Core classes for markup processing."""

16from functools import reduce

17import sys

18from itertools import chain

19import operator

21from genshi.compat import stringrepr, string_types, text_type

22from genshi.util import stripentities, striptags

# Names exported by ``from genshi.core import *``.
__all__ = [
    'Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', 'QName',
]
# Markup language used by the docstrings in this module.
__docformat__ = 'restructuredtext en'

class StreamEventKind(str):
    """A kind of event on a markup stream.

    Instances are cached by value in the class-level `_instances` mapping, so
    constructing the same kind twice yields the very same object. This is
    what allows event kinds to be compared with ``is``.
    """
    __slots__ = []
    _instances = {}

    def __new__(cls, val):
        """Return the cached instance for `val`, creating it on first use.

        :param val: the name of the event kind
        :return: the single shared instance for that name
        """
        try:
            # Common case: the kind already exists, so no new object is built.
            return cls._instances[val]
        except KeyError:
            # ``setdefault`` keeps the "first one wins" guarantee even if two
            # callers race to create the same kind.
            return cls._instances.setdefault(val, str.__new__(cls, val))

class Stream(object):
    """An iterable of markup events.

    Iterating over a stream yields its events, each of which is a tuple of
    the form::

      (kind, data, position)

    ``kind`` is one of the event kinds defined on this class (`START`, `END`,
    `TEXT`, etc), the shape of ``data`` depends on the kind, and ``position``
    is a ``(filename, line, offset)`` tuple locating the original element or
    text in the input. When the original location is unknown, ``position`` is
    ``(None, -1, -1)``.

    A stream can also be turned into text: `serialize()` returns an iterator
    over the generated strings, whereas `render()` returns the complete
    output at once. Both take parameters that influence the serialization.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START')  #: a start tag
    END = StreamEventKind('END')  #: an end tag
    TEXT = StreamEventKind('TEXT')  #: literal text
    XML_DECL = StreamEventKind('XML_DECL')  #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE')  #: doctype declaration
    START_NS = StreamEventKind('START_NS')  #: start namespace mapping
    END_NS = StreamEventKind('END_NS')  #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA')  #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA')  #: end CDATA section
    PI = StreamEventKind('PI')  #: processing instruction
    COMMENT = StreamEventKind('COMMENT')  #: comment

    def __init__(self, events, serializer=None):
        """Create a stream wrapping the given markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the default serialization method to use for this
                           stream

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.events = events  #: The underlying iterable producing the events
        self.serializer = serializer  #: The default serialization method

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream using the ``|``
        operator, giving a notation similar to pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''Hello, world!''', encoding='utf-8')
        >>> print(html)
        Hello, world!

        A filter such as the HTML sanitizer can be applied to that stream
        using the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print(html | sanitizer)
        Hello, world!

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print(html | sanitizer | uppercase)
        HELLO, WORLD!

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print(html | sanitizer | uppercase | output)
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a
                         filter
        :return: the filtered stream
        :rtype: `Stream`
        """
        # The callable may return any iterable; `_ensure` normalizes its items
        # into event tuples. The default serializer carries over.
        filtered = function(self)
        return Stream(_ensure(filtered), serializer=self.serializer)

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Each filter must be a callable that accepts the stream object as its
        parameter and returns the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied
                        as filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        stream = self
        for each in filters:
            stream = stream | each
        return stream

    def render(self, method=None, encoding=None, out=None, **kwargs):
        """Serialize the stream and return the result as one string.

        Any additional keyword arguments are passed on to the serializer, and
        thus depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding`
                    must not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        # Imported here rather than at module level.
        from genshi.output import encode
        if method is None:
            # Fall back to the stream's own default, then to XML.
            method = self.serializer or 'xml'
        chunks = self.serialize(method=method, **kwargs)
        return encode(chunks, method=method, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream holding only the events that match the given
        XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>', encoding='utf-8')
        >>> print(stream.select('elem'))
        <elem>foo</elem><elem>bar</elem>
        >>> print(stream.select('elem/text()'))
        foobar

        Note that the outermost element of the stream becomes the *context
        node* for the XPath test. That means that the expression "doc" would
        not match anything in the example above, because it only tests
        against child elements of the outermost element:

        >>> print(stream.select('doc'))
        <BLANKLINE>

        You can use the "." expression to match the context node itself
        (although that usually makes little sense):

        >>> print(stream.select('.'))
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or
                                 not supported
        """
        # Imported here rather than at module level.
        from genshi.path import Path
        compiled = Path(path)
        return compiled.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Produce the serialized form of the stream piece by piece.

        Unlike the `render()` method, the output is returned incrementally
        as an iterator rather than as a single string.

        Any additional keyword arguments are passed on to the serializer, and
        thus depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        # Imported here rather than at module level.
        from genshi.output import get_serializer
        if method is None:
            method = self.serializer or 'xml'
        serializer = get_serializer(method, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        """Render the stream with its default settings."""
        return self.render()

    def __unicode__(self):
        """Render the stream without encoding the result."""
        return self.render(encoding=None)

    def __html__(self):
        """Return the stream itself as its markup representation."""
        return self

253

254

# Module-level shortcuts for the event kinds defined on `Stream`, so that
# code can refer to e.g. ``TEXT`` instead of ``Stream.TEXT``.
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT

266

267

268def _ensure(stream):

269 """Ensure that every item on the stream is actually a markup event."""

270 stream = iter(stream)

271 try:

272 event = next(stream)

273 except StopIteration:

274 return

275

276 # Check whether the iterable is a real markup event stream by examining the

277 # first item it yields; if it's not we'll need to do some conversion

278 if type(event) is not tuple or len(event) != 3:

279 for event in chain([event], stream):

280 if hasattr(event, 'totuple'):

281 event = event.totuple()

282 else:

283 event = TEXT, text_type(event), (None, -1, -1)

284 yield event

285 return

286

287 # This looks like a markup event stream, so we'll just pass it through

288 # unchanged

289 yield event

290 for event in stream:

291 yield event

292

293

class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    Ordering of the attributes is preserved, while access by name is also
    supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|``
    and ``-`` can be used to produce new instances that have specific
    attributes added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can
    be either a string or a set/sequence of strings, identifying the name(s)
    of the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand
    value is an iterable of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the
    value of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :return: `True` if the list includes the attribute
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def __getitem__(self, i):
        """Return an item or slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1]
        ('title', 'Foo')
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        items = tuple.__getitem__(self, i)
        if type(i) is slice:
            # Slices stay `Attrs` instead of degrading to plain tuples.
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        This hook is only invoked by Python 2 for simple ``attrs[i:j]``
        slices; Python 3 routes all slicing through `__getitem__`.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes. Any attributes in the
        new set that have a value of `None` are removed.

        Existing attributes keep their position (with the value replaced if
        `attrs` names them again); attributes not yet present are appended in
        the order they appear in `attrs`.

        :param attrs: an iterable of ``(name, value)`` tuples; it is read
                      exactly once, so one-shot iterators are fine
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        # Materialize once: the pairs are walked several times below, which
        # would silently yield wrong results for a one-shot iterator.
        attrs = list(attrs)
        # Precompute the existing names so membership tests are not a linear
        # scan of `self` for every incoming attribute.
        existing = set(sn for sn, _ in self)
        remove = set(an for an, av in attrs if av is None)
        replace = dict((an, av) for an, av in attrs
                       if an in existing and av is not None)
        return Attrs([(sn, replace.get(sn, sv)) for sn, sv in self
                      if sn not in remove] +
                     [(an, av) for an, av in attrs
                      if an not in existing and an not in remove])

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance from which all attributes with a name in
        `names` are removed.

        :param names: the name of the attribute to remove, or a set/sequence
                      of such names
        :return: a new instance with the attribute(s) removed
        :rtype: `Attrs`
        """
        if isinstance(names, string_types):
            # A lone string is a single name, not a sequence of characters.
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that
                 attribute does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', '#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, ''.join([x[1] for x in self]), (None, -1, -1)

444

445

class Markup(text_type):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    Operations that combine a `Markup` string with other values (``+``,
    ``%`` and `join`) escape those other values first and return a `Markup`
    instance again.
    """
    __slots__ = []

    def __add__(self, other):
        """Return ``self + other``, with `other` escaped."""
        return Markup(text_type.__add__(self, escape(other)))

    def __radd__(self, other):
        """Return ``other + self``, with `other` escaped."""
        return Markup(text_type.__add__(escape(other), self))

    def __mod__(self, args):
        """Return ``self % args``, escaping every argument beforehand.

        Mapping values, the items of a list or tuple, or a single value are
        each passed through `escape`.
        """
        if isinstance(args, dict):
            args = dict(zip(args.keys(), map(escape, args.values())))
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = escape(args)
        return Markup(text_type.__mod__(self, args))

    def __mul__(self, num):
        """Return the string repeated `num` times, still as `Markup`."""
        return Markup(text_type.__mul__(self, num))
    __rmul__ = __mul__

    def __repr__(self):
        return "<%s %s>" % (type(self).__name__, text_type.__repr__(self))

    def join(self, seq, escape_quotes=True):
        """Return a `Markup` object which is the concatenation of the strings
        in the given sequence, where this `Markup` object is the separator
        between the joined elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        escaped_items = [escape(item, quotes=escape_quotes) for item in seq]
        return Markup(text_type.join(self, escaped_items))

    @classmethod
    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special
        characters it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup '&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that
        are to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup '"1 &lt; 2"'>

        A false value yields an empty instance, an object that is already of
        exactly this class is returned unchanged, and an object providing an
        ``__html__()`` method is converted using the result of that method
        without further escaping.

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        if hasattr(text, '__html__'):
            return cls(text.__html__())

        # The ampersand must be replaced first; otherwise the ampersands of
        # the entities inserted below would themselves get escaped.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)

    def unescape(self):
        """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        '1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return ''
        # Mirror image of `escape`: the ampersand entity is handled last so
        # that an escaped entity such as ``&amp;lt;`` turns into ``&lt;``
        # rather than being unescaped twice.
        return text_type(self).replace('&#34;', '"') \
                              .replace('&gt;', '>') \
                              .replace('&lt;', '<') \
                              .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to
        `True`, the core XML entities (those standing for ``&``, ``'``,
        ``>``, ``<`` and ``"``) are not stripped.

        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        return Markup(striptags(self))

566

567

# Use the `Markup` implementation from the optional `genshi._speedups`
# module when it can be imported; otherwise the pure-Python class defined
# above remains in effect.
try:
    from genshi._speedups import Markup
except ImportError:
    pass  # just use the Python implementation

# Module-level shortcut bound to whichever `Markup` class is active.
escape = Markup.escape

575

576

def unescape(text):
    """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

    >>> unescape(Markup('1 < 2'))
    '1 < 2'

    Only `Markup` instances are processed; any other object is handed back
    exactly as it was passed in.

    >>> unescape('1 < 2')
    '1 < 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text

596

597

class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    Namespace('http://www.w3.org/1999/xhtml')
    >>> html.uri
    'http://www.w3.org/1999/xhtml'

    The `Namespace` object can than be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName('http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    'body'
    >>> html.body.namespace
    'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName('http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        # Passing an existing instance of exactly this class returns that
        # same instance instead of creating a copy.
        if type(uri) is cls:
            return uri
        return object.__new__(cls)

    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __init__(self, uri):
        # Python runs this even when `__new__` handed back an existing
        # instance; in that case `uri` is the instance itself and the text
        # conversion below yields its URI again.
        self.uri = text_type(uri)

    def __contains__(self, qname):
        """Return whether the given `QName` belongs to this namespace."""
        return qname.namespace == self.uri

    def __ne__(self, other):
        return not self == other

    def __eq__(self, other):
        """Compare by URI, against another `Namespace` or a plain value."""
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __getitem__(self, name):
        """Return the `QName` for `name` within this namespace."""
        return QName(self.uri + '}' + name)
    # NOTE: as a consequence, looking up any attribute that does not exist on
    # the instance produces a `QName` rather than raising `AttributeError`.
    __getattr__ = __getitem__

    def __hash__(self):
        return hash(self.uri)

    if sys.version_info[0] == 2:
        # Only use stringrepr in python 2
        def __repr__(self):
            return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))

        def __str__(self):
            # ``str`` is a byte string on Python 2.
            return self.uri.encode('utf-8')
    else:
        def __repr__(self):
            return '%s(%r)' % (type(self).__name__, self.uri)

        def __str__(self):
            # ``__str__`` must return text on Python 3; returning the encoded
            # bytes would make ``str(namespace)`` raise a ``TypeError``.
            return self.uri

    def __unicode__(self):
        return self.uri

686

687

# Namespace of the attributes carrying the reserved ``xml:`` prefix, such as
# xml:lang and xml:space
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')

690

691

class QName(text_type):
    """A qualified element or attribute name.

    The string value of an instance is the qualified name of the element or
    attribute, in the form ``{namespace-uri}local-name``. The two parts are
    additionally available separately: the namespace URI as the `namespace`
    attribute and the local name as the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    QName('foo')
    >>> qname.localname
    'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName('http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    'body'
    >>> qname.namespace
    'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        An object that is already of exactly this class is returned as is.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        # Work on the brace-less form; the first closing brace, if there is
        # one, separates the namespace URI from the local name.
        bare = qname.lstrip('{')
        uri, brace, local = bare.partition('}')
        if brace:
            instance = text_type.__new__(cls, '{%s' % bare)
            instance.namespace = text_type(uri)
            instance.localname = text_type(local)
        else:
            # No namespace part at all: the whole string is the local name.
            instance = text_type.__new__(cls, bare)
            instance.namespace = None
            instance.localname = text_type(bare)
        return instance

    def __getnewargs__(self):
        # The constructor accepts the name without its leading brace.
        return (self.lstrip('{'),)

    if sys.version_info[0] == 2:
        # Only use stringrepr in python 2
        def __repr__(self):
            bare = self.lstrip('{')
            return '%s(%s)' % (type(self).__name__, stringrepr(bare))
    else:
        def __repr__(self):
            bare = self.lstrip('{')
            return '%s(%r)' % (type(self).__name__, bare)