Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/xml/etree/ElementTree.py: 17%
1020 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-20 07:00 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-20 07:00 +0000
1"""Lightweight XML support for Python.
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
6 1. ElementTree represents the whole XML document as a tree and
8 2. Element represents a single node in this tree.
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
18 'tag' - a string containing the element's name.
20 'attributes' - a Python dictionary storing the element's attributes.
22 'text' - a string containing the element's text content.
24 'tail' - an optional string containing text after the element's end tag.
26 And a number of child elements stored in a Python sequence.
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
34"""
36#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
39#
40# ElementTree
41# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
42#
43# fredrik@pythonware.com
44# http://www.pythonware.com
45# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
48# Copyright (c) 1999-2008 by Fredrik Lundh
49#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
73__all__ = [
74 # public symbols
75 "Comment",
76 "dump",
77 "Element", "ElementTree",
78 "fromstring", "fromstringlist",
79 "indent", "iselement", "iterparse",
80 "parse", "ParseError",
81 "PI", "ProcessingInstruction",
82 "QName",
83 "SubElement",
84 "tostring", "tostringlist",
85 "TreeBuilder",
86 "VERSION",
87 "XML", "XMLID",
88 "XMLParser", "XMLPullParser",
89 "register_namespace",
90 "canonicalize", "C14NWriterTarget",
91 ]
93VERSION = "1.3.0"
95import sys
96import re
97import warnings
98import io
99import collections
100import collections.abc
101import contextlib
103from . import ElementPath
class ParseError(SyntaxError):
    """Error raised when an XML document cannot be parsed.

    Besides the regular exception value, a ParseError carries two
    additional attributes:
        'code' - the specific exception code
        'position' - the line and column of the error

    """
117# --------------------------------------------------------------------
def iselement(element):
    """Tell whether *element* looks like an Element.

    The check is duck-typed: any object exposing a 'tag' attribute passes.

    """
    looks_like_element = hasattr(element, 'tag')
    return looks_like_element
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    # The element's name.
    tag = None

    # Dictionary of the element's attributes.
    attrib = None

    # Text before the first subelement.  This is either a string or the
    # value None.  Note that if there is no text, this attribute may be
    # either None or the empty string, depending on the parser.
    text = None

    # Text after this element's end tag, but before the next sibling
    # element's start tag.  This is either a string or the value None.
    # Note that if there was no text, this attribute may be either None or
    # an empty string, depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        # A new dict is built here, so the shared default for *attrib* and
        # any dict passed by the caller are never mutated.
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        Deprecated: emits a DeprecationWarning and delegates to __copy__().

        """
        warnings.warn(
            "elem.copy() is deprecated. Use copy.copy(elem) instead.",
            DeprecationWarning
            )
        return self.__copy__()

    def __copy__(self):
        # Shallow copy: tag, attributes, text and tail are carried over and
        # the very same child objects are attached to the new element.
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # Every item is type-checked before the child list is touched.
        if isinstance(index, slice):
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from a sequence.

        *elements* is an iterable with zero or more elements.

        The iterable is materialized and validated before anything is
        appended.  This makes the operation all-or-nothing (a TypeError
        leaves the element unchanged) and lets ``elem.extend(elem)``
        terminate instead of chasing its own growing child list.

        """
        new_children = list(elements)
        for element in new_children:
            self._assert_is_element(element)
        self._children.extend(new_children)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); the result is the dictionary's key view.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); the result is the dictionary's item
        view.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None yield no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and attach it as the last child of *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built through parent.makeelement() and returned.

    """
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
def Comment(text=None):
    """Build a comment node.

    The returned object is an Element whose tag is the Comment function
    itself; the standard serializer recognizes that tag and writes the
    element as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
def ProcessingInstruction(target, text=None):
    """Build a processing instruction node.

    The returned object is an Element whose tag is the ProcessingInstruction
    function itself; the standard serializer recognizes that tag and writes
    the element as an XML processing instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is given it is stored after the target, separated by one space.

    """
    node = Element(ProcessingInstruction)
    node.text = target + " " + text if text else target
    return node

PI = ProcessingInstruction
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that namespaces are handled properly
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing operate on the wrapped text; the other operand
    may be another QName or a plain value.

    """
    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs
522# --------------------------------------------------------------------
class ElementTree:
    """An XML element hierarchy.

    This class also provides support for serialization to and from
    standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """
    def __init__(self, element=None, file=None):
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        This will discard the current contents of the tree and replace it
        with the given element.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        close_source = False
        if not hasattr(source, "read"):
            # Anything without read() is taken to be a file name; the file
            # opened here is closed again in the finally clause.
            source = open(source, "rb")
            close_source = True
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an accelerator,
                    # can define an internal _parse_whole API for efficiency.
                    # It can be used to parse the whole source without feeding
                    # it with chunks.
                    self._root = parser._parse_whole(source)
                    return self._root
            while True:
                data = source.read(65536)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        return self._root.iter(tag)

    @staticmethod
    def _root_relative(path):
        """Return *path* rewritten for a search starting at the root element.

        A path with a leading "/" gets a "." prepended and a FutureWarning
        is issued.  Any other path is returned unchanged.

        """
        if path[:1] == "/":
            path = "." + path
            # stacklevel=3: skip this helper and the public find* method so
            # the warning is attributed to the code that called find*().
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=3
                )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        path = self._root_relative(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text of first matching element by tag name or path.

        Same as getroot().findtext(path), which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is passed through to getroot().findtext(),
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the result of getroot().findtext(path, default, namespaces).

        """
        path = self._root_relative(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        path = self._root_relative(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        path = self._root_relative(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when method is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised for an unknown *method*.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        enc_lower = encoding.lower()
        with _get_writer(file_or_filename, enc_lower) as write:
            if method == "xml" and (xml_declaration or
                    (xml_declaration is None and
                     enc_lower not in ("utf-8", "us-ascii", "unicode"))):
                declared_encoding = encoding
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serialize = _serialize[method]
                serialize(write, self._root, qnames, namespaces,
                          short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
755# --------------------------------------------------------------------
756# serialization support
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Yield a text ``write`` callable for the target; clean up afterwards.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by open().  *encoding* is the output encoding; the
    special value "unicode" means the target is written as text, without
    an explicit encoding.

    """
    try:
        write = file_or_filename.write
    except AttributeError:
        # No write attribute: treat the argument as a file name.  The file
        # opened here is closed when the context exits.
        if encoding == "unicode":
            stream = open(file_or_filename, "w")
        else:
            stream = open(file_or_filename, "w", encoding=encoding,
                          errors="xmlcharrefreplace")
        with stream:
            yield stream.write
    else:
        # A file-like object was given; the encoding decides whether it is
        # used as a text writer or as a binary writer.
        if encoding == "unicode":
            # Text writer: hand back its own write method untouched.
            yield write
        else:
            # Binary writer: put a TextIOWrapper in front of it.
            with contextlib.ExitStack() as cleanup:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    binary = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    binary = io.BufferedWriter(file_or_filename)
                    # Detach on exit so the caller's raw file stays open
                    # when the BufferedWriter is destroyed.
                    cleanup.callback(binary.detach)
                else:
                    # Objects outside the IOBase hierarchy that only offer
                    # a write method get a minimal buffered shim.
                    binary = io.BufferedIOBase()
                    binary.writable = lambda: True
                    binary.write = write
                    try:
                        # TextIOWrapper consults these to decide whether a
                        # BOM (for UTF-16, etc) should be added.
                        binary.seekable = file_or_filename.seekable
                        binary.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                wrapper = io.TextIOWrapper(binary,
                                           encoding=encoding,
                                           errors="xmlcharrefreplace",
                                           newline="\n")
                # Detach on exit so the underlying file stays open when the
                # TextIOWrapper is destroyed.
                cleanup.callback(wrapper.detach)
                yield wrapper.write
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used below *elem*.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each tag,
    attribute name or QName text to its serialized ``prefix:local`` form,
    and *namespaces* maps namespace URIs to prefixes.  When
    *default_namespace* is given it is registered with the empty prefix.

    ValueError is raised when *default_namespace* is set and a
    non-qualified name is met.

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def register(name):
        # Work out and store the serialized form of *name*.
        try:
            if name[:1] == "{":
                uri, local = name[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[name] = "%s:%s" % (prefix, local)
                else:
                    qnames[name] = local # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[name] = name
        except TypeError:
            _raise_serialization_error(name)

    # Walk the tree and fill both tables.
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                register(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                register(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                register(key)
            if isinstance(value, QName) and value.text not in qnames:
                register(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            register(text.text)
    return qnames, namespaces
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through the *write* callable.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps URIs to prefixes; namespace declarations are written
    on this element only, children are serialized with ``None`` instead.
    *short_empty_elements* selects ``<tag />`` over ``<tag></tag>`` for
    elements without text or children.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # qnames maps this tag to None: write only the text and the
            # children, no markup for the element itself.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
# Lower-case tag names for which the HTML serializer writes no end tag.
# Built directly as a set: the former tuple-then-set() conversion guarded
# by "except NameError" could never take the fallback path, because set()
# is always available as a builtin.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through the *write* callable.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps URIs to prefixes; namespace declarations are written
    on this element only, children are serialized with ``None`` instead.
    Text inside "script" and "style" elements is written unescaped, and no
    end tag is written for tags listed in HTML_EMPTY.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[tag]
        if name is None:
            # qnames maps this tag to None: write only the text and the
            # children, no markup for the element itself.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
def _serialize_text(write, elem):
    """Write the text content of *elem* through the *write* callable.

    Every chunk produced by elem.itertext() is written as is, followed by
    the element's own tail when it is non-empty.

    """
    for chunk in elem.itertext():
        write(chunk)
    tail = elem.tail
    if tail:
        write(tail)
985_serialize = {
986 "xml": _serialize_xml,
987 "html": _serialize_html,
988 "text": _serialize_text,
989# this optional method is imported at the end of the module
990# "c14n": _serialize_c14n,
991}
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global; any entry already using either the given
    prefix or the given namespace URI is dropped before the new mapping
    is stored.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    stale = [known_uri for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* cannot be serialized.

    The message includes the repr of the value and the name of its type.

    """
    message = "cannot serialize %r (type %s)" % (text, type(text).__name__)
    raise TypeError(message)
def _escape_cdata(text):
    """Return *text* with XML markup characters escaped for character data.

    "&", "<" and ">" are replaced by the entity references "&amp;",
    "&lt;" and "&gt;".  "&" is handled first so the ampersands introduced
    by the other replacements are not escaped a second time.

    A value that does not behave like a string is reported through
    _raise_serialization_error() (TypeError).

    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 characters, or so.  assume that's, by far,
        # the most common case in most applications.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    "&", "<", ">" and the double quote become "&amp;", "&lt;", "&gt;" and
    "&quot;".  Carriage return, line feed and tab become the character
    references "&#13;", "&#10;" and "&#09;".

    A value that does not behave like a string is reported through
    _raise_serialization_error() (TypeError).

    """
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        # Although section 2.11 of the XML specification states that CR or
        # CR LN should be replaced with just LN, it applies only to EOLNs
        # which take part of organizing file into lines. Within attributes,
        # we are replacing these with entity numbers, so they do not count.
        # http://www.w3.org/TR/REC-xml/#sec-line-ends
        # The current solution, contained in following six lines, was
        # discussed in issue 17582 and 39011.
        if "\r" in text:
            text = text.replace("\r", "&#13;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        if "\t" in text:
            text = text.replace("\t", "&#09;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_attrib_html(text):
    """Escape an attribute value for HTML output.

    Replaces ``&``, ``>`` and ``"`` by ``&amp;``, ``&gt;`` and ``&quot;``;
    ``<`` is left untouched.  If *text* does not support the membership
    test or ``replace()`` (TypeError / AttributeError), the failure is
    reported through _raise_serialization_error().
    """
    try:
        # "&" first, so ampersands of the entities added below stay intact.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1090# --------------------------------------------------------------------
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned.  Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").

    Returns an (optionally) encoded string containing the XML data.

    """
    # A text buffer is only appropriate for the "unicode" pseudo-encoding;
    # every other encoding is written as bytes.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(
        buffer,
        encoding,
        xml_declaration=xml_declaration,
        default_namespace=default_namespace,
        method=method,
        short_empty_elements=short_empty_elements,
    )
    return buffer.getvalue()
class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference.

    Every chunk passed to write() is appended to the list given to the
    constructor; tell() reports the number of chunks collected so far.
    """

    def __init__(self, lst):
        # The caller keeps its own reference to *lst* and reads the
        # collected chunks from it.
        self.lst = lst

    def writable(self):
        return True

    def seekable(self):
        return True

    def write(self, b):
        self.lst.append(b)

    def tell(self):
        return len(self.lst)
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Serialize *element* and return the output as a list of chunks.

    Takes the same arguments as tostring().  The element is written
    through a _ListDataStream, so the returned list holds the pieces in
    the order in which they were written.
    """
    chunks = []
    ElementTree(element).write(
        _ListDataStream(chunks),
        encoding,
        xml_declaration=xml_declaration,
        default_namespace=default_namespace,
        method=method,
        short_empty_elements=short_empty_elements,
    )
    return chunks
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    # Terminate the output with a newline unless the root's tail text
    # already ends with one.
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level.  Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    ValueError is raised if *level* is negative.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        # Nothing to indent: the element has no children.
        return

    # Reduce the memory consumption by reusing indentation strings.
    # Index i holds the indentation used i levels below the start level.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            # First visit to this depth: derive it from the parent level.
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        # Only whitespace-only (or missing) text is replaced; real text
        # content is left alone.
        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)
1215# --------------------------------------------------------------------
1216# parsing
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This class also reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the input has been consumed completely and is
    then set to the root element.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator():
        # Runs lazily: *source* and *close_source* are read from the
        # enclosing scope when the first item is requested, i.e. after
        # the assignments at the end of iterparse().
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                chunk = source.read(16 * 1024)
                if not chunk:
                    break
                pullparser.feed(chunk)
            document_root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = document_root
        finally:
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = iterator().__next__

    it = IterParseIterator()
    it.root = None
    # The local names are no longer needed once the iterator object exists.
    del iterator, IterParseIterator

    # Only a source opened here is closed by the generator's finally block.
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    return it
class XMLPullParser:
    """Parser that queues parse events for the caller to pull.

    Data is pushed in with feed(); the resulting (event, elem) pairs are
    retrieved with read_events().  *events* selects the events to report
    and defaults to ``("end",)``.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        wanted = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, wanted)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # Defer the error: it is raised by read_events() once the
            # events queued before it have been consumed.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements without an "id"
    attribute, or with an empty one, are not included.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    root = active_parser.close()
    by_id = {}
    for node in root.iter():
        node_id = node.get("id")
        if node_id:
            # A later element with the same id replaces an earlier one.
            by_id[node_id] = node
    return root, by_id
# Parse XML document from string constant.  Alias for XML(): both names
# are bound to the same function object.
fromstring = XML
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence, *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    # Fragments are fed one by one, in order.
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()
1390# --------------------------------------------------------------------
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []     # data collector
        self._elem = []     # element stack
        self._last = None   # last element
        self._root = None   # root element
        self._tail = None   # true if we're after an end tag
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self.insert_comments = insert_comments
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_pis = insert_pis
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected character data to the most recent element:
        # as its tail when an end tag came last, as its text otherwise.
        if not self._data:
            return
        if self._last is not None:
            text = "".join(self._data)
            if self._tail:
                assert self._last.tail is None, "internal error (tail)"
                self._last.tail = text
            else:
                assert self._last.text is None, "internal error (text)"
                self._last.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        if self._elem:
            # Nested element: becomes a child of the innermost open one.
            self._elem[-1].append(elem)
        elif self._root is None:
            self._root = elem
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node (comment / PI).  It is always returned,
        # but only linked into the tree when *insert* is true.
        elem = factory(*args)
        if insert:
            self._flush()
            self._last = elem
            if self._elem:
                self._elem[-1].append(elem)
            self._tail = 1
        return elem
1514# also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" separates namespace URI and local name in the names expat
        # reports; _fixname() turns that into the "{uri}local" form.
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks: only installed for methods the target provides
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler binds what it needs through default arguments, so the
        # values are captured when the handler is defined.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1

                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))

                parser.StartElementHandler = handler
            elif event_name == "end":

                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))

                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):

                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:

                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))

                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):

                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:

                    def handler(prefix, event=event_name, append=append):
                        append((event, None))

                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':

                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))

                parser.CommentHandler = handler
            elif event_name == 'pi':

                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))

                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its error
        # code and its (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        memo = self._names
        try:
            return memo[key]
        except KeyError:
            name = "{" + key if "}" in key else key
            memo[key] = name
            return name

    def _start_ns(self, prefix, uri):
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for pos in range(0, len(attr_list), 2):
                attr_name = fixname(attr_list[pos])
                attrib[attr_name] = attr_list[pos + 1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        lead = text[:1]
        if lead == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif lead == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if lead == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            count = len(self._doctype)
            if count > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and count == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        # strip the surrounding quote characters
                        pubid = pubid[1:-1]
                elif kind == "SYSTEM" and count == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    # not enough pieces collected yet
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, False)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse(b"", True)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            # target without close(): nothing to return
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
1742# --------------------------------------------------------------------
1743# C14N 2.0
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string.

    Either *xml_data* (an XML string) or *from_file* (a file path or
    file-like object) must be provided as input; ValueError is raised if
    both are missing.  When both are given, *xml_data* is used.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect into a private buffer only when the caller passed no sink.
    buffer = None
    if out is None:
        buffer = out = io.StringIO()

    parser = XMLParser(target=C14NWriterTarget(out.write, **options))

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    elif from_file is not None:
        parse(from_file, parser=parser)

    if buffer is None:
        return None
    return buffer.getvalue()
# Matcher for strings of the form "word:word" (one colon, word characters
# on both sides).  C14NWriterTarget uses it to decide whether text or an
# attribute value should be treated as a prefixed name.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """

    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        # Empty / missing option collections are normalised to None so
        # that the hot paths only need an "is not None" check.
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        self._qname_aware_tags = (
            set(qname_aware_tags) if qname_aware_tags else None)
        self._find_qname_aware_attrs = (
            set(qname_aware_attrs).intersection if qname_aware_attrs else None)

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        self._prefix_map = {}
        self._preserve_space = [False]
        self._pending_start = None
        self._root_seen = False
        self._root_done = False
        self._ignored_depth = 0

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        # Yield (uri, prefix) pairs, innermost scope first.
        for scope in _reversed(ns_stack):
            if scope:  # almost no element declares new namespaces
                yield from scope

    def _resolve_prefix_name(self, prefixed_name):
        # Turn "prefix:name" into "{uri}name" using the prefixes in scope.
        prefix, name = prefixed_name.split(':', 1)
        for uri, known_prefix in self._iter_namespaces(self._ns_stack):
            if known_prefix == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        # Return (serialised name, local name, uri) for *qname*, adding a
        # namespace declaration to the current element when needed.
        if uri is None:
            if qname[:1] == '{':
                uri, tag = qname[1:].rsplit('}', 1)
            else:
                uri, tag = '', qname
        else:
            tag = qname

        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        if not uri:
            # As soon as a default namespace is defined,
            # anything that has no namespace (and thus, no prefix) goes there.
            return tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        text = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            text = text.strip()
        if self._pending_start is not None:
            # A qname-aware start tag was deferred until its text is known.
            args, self._pending_start = self._pending_start, None
            qname_text = text if text and _looks_like_prefix_name(text) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                # _start() has already written the resolved text.
                return
        if text and self._root_seen:
            self._write(_escape_cdata_c14n(text))

    def start_ns(self, prefix, uri):
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        if self._data:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            # Inside (or entering) an excluded subtree: only track depth.
            self._ignored_depth += 1
            return
        if self._data:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        if self._data:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        # Only the initial entry is left once the root element is closed.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            # after the root element: newline goes before the comment
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            # before the root element: newline goes after the comment
            self._write('\n')

    def pi(self, target, data):
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
def _escape_cdata_c14n(text):
    """Escape character data for C14N output.

    Replaces ``&``, ``<`` and ``>`` by ``&amp;``, ``&lt;`` and ``&gt;``,
    and CR by the character reference ``&#xD;``.  If *text* does not
    support the membership test or ``replace()`` (TypeError /
    AttributeError), the failure is reported through
    _raise_serialization_error().
    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so.  assume that's, by far,
        # the most common case in most applications.
        #
        # "&" first, so ampersands of the entities added below stay intact.
        if '&' in text:
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '>' in text:
            text = text.replace('>', '&gt;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_attrib_c14n(text):
    """Escape an attribute value for C14N output.

    Replaces ``&``, ``<`` and ``"`` by ``&amp;``, ``&lt;`` and ``&quot;``,
    and TAB, LF and CR by the character references ``&#x9;``, ``&#xA;``
    and ``&#xD;``.  ``>`` is left untouched.  If *text* does not support
    the membership test or ``replace()`` (TypeError / AttributeError),
    the failure is reported through _raise_serialization_error().
    """
    try:
        # "&" first, so ampersands of the entities added below stay intact.
        if '&' in text:
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '"' in text:
            text = text.replace('"', '&quot;')
        if '\t' in text:
            text = text.replace('\t', '&#x9;')
        if '\n' in text:
            text = text.replace('\n', '&#xA;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
2075# --------------------------------------------------------------------
2077# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator module: the pure-Python definitions stay in place.
    pass
else:
    # Hand the Comment and ProcessingInstruction factories defined in this
    # module to the C implementation.
    _set_factories(Comment, ProcessingInstruction)