Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/genshi/core.py: 57%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006-2009 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at http://genshi.edgewall.org/wiki/License.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at http://genshi.edgewall.org/log/.
14"""Core classes for markup processing."""
16from functools import reduce
17import sys
18from itertools import chain
19import operator
21from genshi.compat import stringrepr, string_types, text_type
22from genshi.util import stripentities, striptags
# Public names exported by ``from genshi.core import *``.
__all__ = [
    'Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', 'QName'
]
# Docstrings in this module are written in reStructuredText.
__docformat__ = 'restructuredtext en'
class StreamEventKind(str):
    """A kind of event on a markup stream.

    Instances are interned per value: constructing a kind with a value that
    has been seen before returns the very same object, which is why event
    kinds can be compared using ``is``.
    """
    __slots__ = []
    _instances = {}  # value -> the one shared instance created for it

    def __new__(cls, val):
        registry = cls._instances
        try:
            return registry[val]
        except KeyError:
            # First time this value is seen: register a new instance.
            # ``setdefault`` guarantees that whichever instance was stored
            # first is the one every caller gets back.
            return registry.setdefault(val, str.__new__(cls, val))
class Stream(object):
    """A stream of markup events.

    A `Stream` is little more than an iterable of events, where every event
    is a tuple of the form::

      (kind, data, position)

    ``kind`` is the event kind (`START`, `END`, `TEXT`, and so on), the shape
    of ``data`` depends on the kind, and ``position`` is a
    ``(filename, line, offset)`` tuple locating the original element or text
    in the input. When the original location is not known, ``position`` is
    ``(None, -1, -1)``.

    The stream can also be turned into text: `serialize()` returns an
    iterator over the generated strings, whereas `render()` produces the
    complete output in one go. Both take parameters that influence how the
    stream is serialized.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Create a stream wrapping the given markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the serialization method used by default when
                           this stream is rendered

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.serializer = serializer #: The default serialization method
        self.events = events #: The underlying iterable producing the events

    def __iter__(self):
        return iter(self.events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream using the ``|``
        operator, in the style of pipes in a Unix shell.

        Take the following stream, produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''', encoding='utf-8')
        >>> print(html)
        <p onclick="alert('Whoa')">Hello, world!</p>

        The HTML sanitizer filter can be applied to it with the pipe
        notation:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print(html | sanitizer)
        <p>Hello, world!</p>

        Any function that takes a stream and produces a stream (that is,
        anything iterating over events) can act as a filter:

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print(html | sanitizer | uppercase)
        <p>HELLO, WORLD!</p>

        The same notation works for serializers:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print(html | sanitizer | uppercase | output)
        HELLO, WORLD!

        Serializers normally belong at the very end of such a "pipeline";
        placing one in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a
                         filter
        :return: the filtered stream
        :rtype: `Stream`
        """
        # The callable's result is normalized into real markup events and
        # the new stream inherits this stream's default serializer.
        filtered = _ensure(function(self))
        return Stream(filtered, serializer=self.serializer)

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Every filter must be a callable that takes the stream object as its
        parameter and returns the filtered stream. The call::

            stream.filter(filter1, filter2)

        does the same as::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied
                        as filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        # Fold the filters onto this stream from left to right using ``|``.
        return reduce(operator.or_, filters, self)

    def render(self, method=None, encoding=None, out=None, **kwargs):
        """Return a string representation of the stream.

        Additional keyword arguments are handed on to the serializer, so
        which ones are accepted depends on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding`
                    must not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        from genshi.output import encode
        # An explicit method wins; otherwise fall back to the stream's own
        # default serializer, and finally to XML.
        chosen = method if method is not None else (self.serializer or 'xml')
        chunks = self.serialize(method=chosen, **kwargs)
        return encode(chunks, method=chosen, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream holding the events that match the given XPath
        expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>', encoding='utf-8')
        >>> print(stream.select('elem'))
        <elem>foo</elem><elem>bar</elem>
        >>> print(stream.select('elem/text()'))
        foobar

        The outermost element of the stream is the *context node* of the
        XPath test. The expression "doc" therefore matches nothing in the
        example above, as it is only tested against the children of the
        outermost element:

        >>> print(stream.select('doc'))
        <BLANKLINE>

        The "." expression matches the context node itself (although that
        usually makes little sense):

        >>> print(stream.select('.'))
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or
                                 not supported
        """
        from genshi.path import Path
        xpath = Path(path)
        return xpath.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Generate the strings that make up a particular serialization of
        the stream.

        In contrast to `render()`, the serialized output is produced
        incrementally through an iterator rather than as one single string.

        Additional keyword arguments are handed on to the serializer, so
        which ones are accepted depends on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        from genshi.output import get_serializer
        chosen = method if method is not None else (self.serializer or 'xml')
        serializer = get_serializer(chosen, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        return self.render()

    def __unicode__(self):
        return self.render(encoding=None)

    def __html__(self):
        return self
# Module-level aliases for the event kinds defined on `Stream`, so that code
# can write e.g. ``TEXT`` instead of ``Stream.TEXT``.
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT
268def _ensure(stream):
269 """Ensure that every item on the stream is actually a markup event."""
270 stream = iter(stream)
271 try:
272 event = next(stream)
273 except StopIteration:
274 return
276 # Check whether the iterable is a real markup event stream by examining the
277 # first item it yields; if it's not we'll need to do some conversion
278 if type(event) is not tuple or len(event) != 3:
279 for event in chain([event], stream):
280 if hasattr(event, 'totuple'):
281 event = event.totuple()
282 else:
283 event = TEXT, text_type(event), (None, -1, -1)
284 yield event
285 return
287 # This looks like a markup event stream, so we'll just pass it through
288 # unchanged
289 yield event
290 for event in stream:
291 yield event
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    Ordering of the attributes is preserved, while access by name is also
    supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the attribute name to look for
        :return: `True` if the list includes the attribute
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def __getitem__(self, i):
        """Return an item or slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1]
        ('title', 'Foo')
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        items = tuple.__getitem__(self, i)
        if type(i) is slice:
            # Slices stay `Attrs` instances instead of decaying to tuples.
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        This is the Python 2 slicing hook; on Python 3 slicing is routed
        through `__getitem__`.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes. Any attributes in the new
        set that have a value of `None` are removed.

        Existing attributes keep their position (with the value replaced when
        `attrs` provides a new one); attributes that did not exist yet are
        appended in the order in which `attrs` provides them.

        :param attrs: any iterable of ``(name, value)`` tuples; one-shot
                      iterables such as generators are supported
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        # `attrs` is walked several times below, so it is materialized once;
        # otherwise an iterator would be exhausted by the first pass and the
        # additions/replacements would be silently lost.
        attrs = list(attrs)
        # One pass over our own names gives O(1) membership tests below
        # instead of a linear `in self` scan per incoming attribute.
        own_names = {name for name, _ in self}
        remove = {name for name, value in attrs if value is None}
        replace = {name: value for name, value in attrs
                   if value is not None and name in own_names}
        kept = [(name, replace.get(name, value)) for name, value in self
                if name not in remove]
        added = [(name, value) for name, value in attrs
                 if name not in own_names and name not in remove]
        return Attrs(kept + added)

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance with all attributes with a name in `names` are
        removed.

        :param names: the names of the attributes to remove; either a single
                      string or a set/sequence of strings
        :return: a new instance with the attribute removed
        :rtype: `Attrs`
        """
        if isinstance(names, string_types):
            # A bare string names exactly one attribute; without this it
            # would be treated as a container of characters/substrings.
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', '#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, ''.join([x[1] for x in self]), (None, -1, -1)
class Markup(text_type):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    Operations that combine a `Markup` string with other values (``+``,
    ``%``, `join`) escape those other values first, so the result can again
    be treated as safe markup.
    """
    __slots__ = []

    def __add__(self, other):
        return Markup(text_type.__add__(self, escape(other)))

    def __radd__(self, other):
        return Markup(text_type.__add__(escape(other), self))

    def __mod__(self, args):
        # Every interpolated value is escaped before formatting; mappings
        # and sequences are escaped element-wise.
        if isinstance(args, dict):
            args = {key: escape(value) for key, value in args.items()}
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = escape(args)
        return Markup(text_type.__mod__(self, args))

    def __mul__(self, num):
        return Markup(text_type.__mul__(self, num))
    __rmul__ = __mul__

    def __repr__(self):
        return "<%s %s>" % (type(self).__name__, text_type.__repr__(self))

    def join(self, seq, escape_quotes=True):
        """Return a `Markup` object which is the concatenation of the strings
        in the given sequence, where this `Markup` object is the separator
        between the joined elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        escaped_items = [escape(item, quotes=escape_quotes) for item in seq]
        return Markup(text_type.join(self, escaped_items))

    @classmethod
    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup '&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup '"1 &lt; 2"'>

        Values that are falsy produce an empty `Markup` string, instances of
        exactly this class are returned as they are, and objects providing an
        ``__html__()`` method contribute the result of that method without
        further escaping. Any other non-string value is converted to text
        before being escaped.

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        if hasattr(text, '__html__'):
            return cls(text.__html__())

        if not isinstance(text, text_type):
            # Numbers and other objects have no usable ``replace()``;
            # stringify them so they can be escaped like any other text.
            text = text_type(text)
        # The ampersand must be replaced first, otherwise the ampersands of
        # the entities inserted for ``<`` and ``>`` would be escaped again.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)

    def unescape(self):
        """Reverse-escapes &amp;, &lt;, &gt;, and &#34; and returns a
        `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        '1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return ''
        # The ampersand is restored last so that e.g. ``&amp;lt;`` yields the
        # literal text ``&lt;`` rather than being unescaped twice.
        return text_type(self).replace('&#34;', '"') \
                              .replace('&gt;', '>') \
                              .replace('&lt;', '<') \
                              .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :param keepxmlentities: whether the core XML entities should be left
                                in place
        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        return Markup(striptags(self))
# Replace the pure-Python `Markup` class with the implementation from the
# optional ``genshi._speedups`` module whenever that module can be imported.
try:
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation


# Module-level shortcut for escaping text, bound to whichever `Markup`
# implementation ended up being selected above.
escape = Markup.escape
def unescape(text):
    """Reverse-escapes &amp;, &lt;, &gt;, and \" and returns a `unicode`
    object.

    >>> unescape(Markup('1 &lt; 2'))
    '1 < 2'

    Only `Markup` instances are unescaped; any other `text` object is handed
    back exactly as it was passed in.

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    Namespace('http://www.w3.org/1999/xhtml')
    >>> html.uri
    'http://www.w3.org/1999/xhtml'

    The `Namespace` object can than be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName('http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    'body'
    >>> html.body.namespace
    'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers (and is required
    for names of the form ``__name__``, which attribute access does not turn
    into `QName` objects):

    >>> html['body']
    QName('http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        # Passing an existing namespace returns that very object instead of
        # wrapping it in a new one.
        if type(uri) is cls:
            return uri
        return object.__new__(cls)

    # Pickle/copy support: the instance is recreated from its URI and the
    # URI doubles as the complete instance state.
    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __init__(self, uri):
        self.uri = text_type(uri)

    def __contains__(self, qname):
        return qname.namespace == self.uri

    def __ne__(self, other):
        return not self == other

    def __eq__(self, other):
        # Namespaces compare equal to other namespaces with the same URI as
        # well as to the bare URI string itself.
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __getitem__(self, name):
        return QName(self.uri + '}' + name)

    def __getattr__(self, name):
        """Return the `QName` for `name` in this namespace.

        Only invoked for attributes that are not found through the normal
        lookup.

        :raises AttributeError: for names of the form ``__name__`` and for
                                ``uri``
        """
        # Names like ``__deepcopy__`` or ``__html__`` are protocol probes
        # made via ``getattr()``/``hasattr()``; answering them with a `QName`
        # makes the prober call a string. A missing ``uri`` (instance not yet
        # initialized) would otherwise recurse endlessly through
        # ``__getitem__``.
        if name == 'uri' or (name[:2] == '__' and name[-2:] == '__'):
            raise AttributeError(name)
        return self[name]

    def __hash__(self):
        return hash(self.uri)

    if sys.version_info[0] == 2:
        # Only use stringrepr in python 2
        def __repr__(self):
            return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))

        def __str__(self):
            # On Python 2 ``str`` is a byte string.
            return self.uri.encode('utf-8')
    else:
        def __repr__(self):
            return '%s(%r)' % (type(self).__name__, self.uri)

        def __str__(self):
            # ``__str__`` must return text on Python 3; returning the
            # encoded bytes makes ``str(namespace)`` raise a TypeError.
            return self.uri

    def __unicode__(self):
        return self.uri
# The namespace that attributes such as ``xml:lang`` and ``xml:space``
# belong to.
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
class QName(text_type):
    """A qualified element or attribute name.

    The text value of an instance is the qualified name of the element or
    attribute, written as ``{namespace-uri}local-name``. The two parts are
    additionally exposed as attributes: `namespace` holds the namespace URI
    (`None` for names without a namespace) and `localname` the local name.

    >>> qname = QName('foo')
    >>> qname
    QName('foo')
    >>> qname.localname
    'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName('http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    'body'
    >>> qname.namespace
    'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        An object that already is an instance of exactly this class is
        returned as it is.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        # Drop the optional leading brace(s), then cut at the first closing
        # brace: what precedes it is the namespace URI, the rest the local
        # name.
        stripped = qname.lstrip('{')
        namespace, brace, localname = stripped.partition('}')
        if brace:
            instance = text_type.__new__(cls, '{%s' % stripped)
            instance.namespace = text_type(namespace)
            instance.localname = text_type(localname)
        else:
            # No closing brace: the name is not in any namespace.
            instance = text_type.__new__(cls, stripped)
            instance.namespace = None
            instance.localname = text_type(stripped)
        return instance

    def __getnewargs__(self):
        # Constructor argument used to recreate the name, e.g. on unpickling.
        return (self.lstrip('{'),)

    if sys.version_info[0] == 2:
        # Only use stringrepr in python 2
        def __repr__(self):
            return '%s(%s)' % (type(self).__name__, stringrepr(self.lstrip('{')))
    else:
        def __repr__(self):
            return '%s(%r)' % (type(self).__name__, self.lstrip('{'))