Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/xml/etree/ElementTree.py: 17%

1"""Lightweight XML support for Python.

3 XML is an inherently hierarchical data format, and the most natural way to

4 represent it is with a tree. This module has two classes for this purpose:

6 1. ElementTree represents the whole XML document as a tree and

8 2. Element represents a single node in this tree.

10 Interactions with the whole document (reading and writing to/from files) are

11 usually done on the ElementTree level. Interactions with a single XML element

12 and its sub-elements are done on the Element level.

14 Element is a flexible container object designed to store hierarchical data

15 structures in memory. It can be described as a cross between a list and a

16 dictionary. Each Element has a number of properties associated with it:

18 'tag' - a string containing the element's name.

20 'attributes' - a Python dictionary storing the element's attributes.

22 'text' - a string containing the element's text content.

24 'tail' - an optional string containing text after the element's end tag.

26 And a number of child elements stored in a Python sequence.

28 To create an element instance, use the Element constructor,

29 or the SubElement factory function.

31 You can also use the ElementTree class to wrap an element structure

32 and convert it to and from XML.

34"""

36#---------------------------------------------------------------------

37# Licensed to PSF under a Contributor Agreement.

38# See http://www.python.org/psf/license for licensing details.

39#

40# ElementTree

42#

43# fredrik@pythonware.com

44# http://www.pythonware.com

45# --------------------------------------------------------------------

46# The ElementTree toolkit is

47#

48# Copyright (c) 1999-2008 by Fredrik Lundh

49#

50# By obtaining, using, and/or copying this software and/or its

51# associated documentation, you agree that you have read, understood,

52# and will comply with the following terms and conditions:

53#

54# Permission to use, copy, modify, and distribute this software and

55# its associated documentation for any purpose and without fee is

56# hereby granted, provided that the above copyright notice appears in

57# all copies, and that both that copyright notice and this permission

58# notice appear in supporting documentation, and that the name of

59# Secret Labs AB or the author not be used in advertising or publicity

60# pertaining to distribution of the software without specific, written

61# prior permission.

62#

63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70# OF THIS SOFTWARE.

71# --------------------------------------------------------------------

# Names exported by ``from xml.etree.ElementTree import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "indent",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
    "canonicalize",
    "C14NWriterTarget",
]

# Version string of the ElementTree toolkit.
VERSION = "1.3.0"

95import sys

96import re

97import warnings

98import io

99import collections

100import collections.abc

101import contextlib

102

103from . import ElementPath

104

105

class ParseError(SyntaxError):
    """Signals that an XML document could not be parsed.

    Besides its exception value, a ParseError carries two extra
    attributes:
        'code' - the specific exception code
        'position' - the line and column of the error

    """

116

117# --------------------------------------------------------------------

118

119

def iselement(element):
    """Tell whether *element* looks like an Element, i.e. has a 'tag'."""
    missing = object()
    return getattr(element, 'tag', missing) is not missing

123

124

class Element:
    """A single node of an XML tree.

    This class is the reference implementation of the Element interface.

    The length of an element is the number of its direct subelements, so
    an element is only truly empty when BOTH its length is zero AND its
    text attribute holds nothing.

    Tags, attribute names and attribute values may be bytes or strings.

    *tag* is the element name, *attrib* is an optional dictionary of
    element attributes, and *extra* supplies additional attributes as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    # The element's name.
    tag = None

    # Dictionary of the element's attributes.
    attrib = None

    # Text before the first subelement: either a string or None.  When
    # there is no text the value may be None or the empty string,
    # depending on the parser.
    text = None

    # Text after this element's end tag but before the next sibling's
    # start tag: either a string or None.  When there is no text the
    # value may be None or the empty string, depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        if not isinstance(attrib, dict):
            kind = attrib.__class__.__name__
            raise TypeError("attrib must be dict, not %s" % (kind,))
        self.tag = tag
        # Always a fresh dict, so the caller's mapping (and the shared
        # default) is never mutated through this element.
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        cls_name = self.__class__.__name__
        return "<%s %r at %#x>" % (cls_name, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Build a new element of the same class as this one.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function
        instead.

        """
        factory = self.__class__
        return factory(tag, attrib)

    def copy(self):
        """Return a shallow copy of this element (deprecated).

        Subelements are shared with the original tree.  A
        DeprecationWarning is issued; copy.copy(elem) is the replacement.

        """
        warnings.warn(
            "elem.copy() is deprecated. Use copy.copy(elem) instead.",
            DeprecationWarning
            )
        return self.__copy__()

    def __copy__(self):
        # Shallow copy: new node, same child objects.
        duplicate = self.makeelement(self.tag, self.attrib)
        duplicate.text, duplicate.tail = self.text, self.tail
        duplicate[:] = self
        return duplicate

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        # emulate old behaviour, for now: truthy only with subelements
        return len(self._children) != 0

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # A slice assigns an iterable of elements; an integer index
        # assigns a single element.  Validate before storing.
        candidates = element if isinstance(index, slice) else (element,)
        for candidate in candidates:
            self._assert_is_element(candidate)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* as the last child of this element.

        In document order the new element follows the last existing
        subelement (or the text, if it is the first subelement) and
        precedes this element's end tag.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append every subelement from the iterable *elements*.

        Each item is checked and appended in turn, so items preceding an
        invalid one have already been added when TypeError is raised.

        """
        for item in elements:
            self._assert_is_element(item)
            self._children.append(item)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if isinstance(e, _Element_Py):
            return
        raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove *subelement* from the children of this element.

        Removal is delegated to list.remove on the child list.  To remove
        subelements by other criteria, build the list of elements to keep
        and use slice assignment on the parent.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find the first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to
        full name.

        Delegates to ElementPath.find.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find the text of the first matching element by tag or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to
        full name.

        Delegates to ElementPath.findtext.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to
        full name.

        Delegates to ElementPath.findall.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Iterate over all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to
        full name.

        Delegates to ElementPath.iterfind.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset the element.

        Empties the attribute dictionary in place, drops all subelements
        and sets both text and tail to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = None
        self.tail = None

    def get(self, key, default=None):
        """Return the value of attribute *key*.

        Equivalent to attrib.get(key, default): *default* is returned
        when the attribute is not present.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set attribute *key* to *value*.

        Equivalent to attrib[key] = value.

        """
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names.

        Equivalent to attrib.keys().

        """
        return self.attrib.keys()

    def items(self):
        """Return the attributes as (name, value) pairs.

        Equivalent to attrib.items().

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create a tree iterator.

        The iterator walks this element and all subelements in document
        order, yielding every element whose tag matches.

        If the tree structure is modified during iteration, new or
        removed elements may or may not be included.  To get a stable
        set, call list() on the iterator and loop over the result.

        *tag* is the tag to look for; None or "*" matches every element.

        """
        wanted = None if tag == "*" else tag
        if wanted is None or self.tag == wanted:
            yield self
        for child in self._children:
            yield from child.iter(wanted)

    def itertext(self):
        """Create a text iterator.

        The iterator walks this element and all subelements in document
        order, yielding all inner text.  Nothing is yielded when the tag
        is neither a string nor None.

        """
        own_tag = self.tag
        if not (isinstance(own_tag, str) or own_tag is None):
            return
        leading = self.text
        if leading:
            yield leading
        for child in self:
            yield from child.itertext()
            trailing = child.tail
            if trailing:
                yield trailing

424

425

def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name,
    *attrib* is an optional dictionary containing element attributes, and
    *extra* are additional attributes given as keyword arguments.

    The element is built with parent.makeelement(), appended with
    parent.append(), and returned.

    """
    # Merge into a fresh dict so neither *attrib* nor the shared default
    # is modified.
    merged = {**attrib, **extra}
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

442

443

def Comment(text=None):
    """Comment element factory.

    Creates a special element, tagged with this very function, which the
    standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node

456

457

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Creates a special element, tagged with this very function, which the
    standard serializer writes out as an XML processing instruction.

    *target* is a string containing the processing instruction, *text* is
    a string containing the processing instruction contents, if any.  When
    *text* is given, the element text is "target text" joined by a single
    space; otherwise it is just the target.

    """
    node = Element(ProcessingInstruction)
    node.text = target
    if text:
        node.text = (node.text + " " + text)
    return node

# Short alias for the factory above.
PI = ProcessingInstruction

475

476

class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value in this class to get proper namespace
    handling on output.

    *text_or_uri* is a string containing the QName value either in the
    form {uri}local, or, if the tag argument is given, the URI part of a
    QName.

    *tag* is an optional argument which, if given, makes the first
    argument (text_or_uri) be interpreted as a URI and this argument
    (tag) be interpreted as a local name.

    Comparisons and hashing operate on the resulting text; the other
    operand may be a QName or any object comparable with that text.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

521

522# --------------------------------------------------------------------

523

524

class ElementTree:
    """An XML element hierarchy.

    Wraps a root element and adds support for reading a whole document
    from, and writing it to, standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """

    def __init__(self, element=None, file=None):
        self._root = element  # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        The current contents of the tree are discarded in favour of the
        given element.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object (anything with a read
        attribute is used as is; anything else is opened in binary mode
        and closed again afterwards), *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if parser is None:
                # No parser given: fall back to a default XMLParser.
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an
                    # accelerator, can define an internal _parse_whole
                    # API that consumes the whole source at once instead
                    # of being fed chunk by chunk.
                    self._root = parser._parse_whole(source)
                    return self._root
            chunk = source.read(65536)
            while chunk:
                parser.feed(chunk)
                chunk = source.read(65536)
            self._root = parser.close()
            return self._root
        finally:
            # Only close what this method opened itself.
            if opened_here:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document
        order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        return self._root.iter(tag)

    @staticmethod
    def _anchor_leading_slash(path):
        # Shared by the find* methods: a path starting with "/" is
        # rewritten to start with "./" and a FutureWarning is issued.
        # stacklevel=3 attributes the warning to the caller of the public
        # method, one frame above this helper's own caller.
        if path[:1] != "/":
            return path
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=3
            )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find(), except
        that a leading "/" in *path* is rewritten to "./" with a
        FutureWarning.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to
        full name.

        """
        path = self._anchor_leading_slash(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text of first matching element by tag name or path.

        Same as getroot().findtext(path), which is Element.findtext(),
        except that a leading "/" in *path* is rewritten to "./" with a
        FutureWarning.

        *path* is a string having either an element tag or an XPath,
        *default* is passed through as the value to use when nothing
        matches, *namespaces* is an optional mapping from namespace
        prefix to full name.

        """
        path = self._anchor_leading_slash(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall(),
        except that a leading "/" in *path* is rewritten to "./" with a
        FutureWarning.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to
        full name.

        """
        path = self._anchor_leading_slash(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Iterate over all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is Element.iterfind(),
        except that a leading "/" in *path* is rewritten to "./" with a
        FutureWarning.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to
        full name.

        """
        path = self._anchor_leading_slash(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for
                                writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when the method is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration
                               should be added to the output.  If None,
                               an XML declaration is added if encoding IS
                               NOT either of: US-ASCII, UTF-8, or Unicode.
                               A declaration is only ever written for the
                               "xml" method.

          *default_namespace* -- sets the default XML namespace (for
                                 "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n";
                      ValueError is raised for an unknown method

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content.  If True
                                    (default) they are emitted as a
                                    single self-closed tag, otherwise
                                    they are emitted as a pair of
                                    start/end tags

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            encoding = "utf-8" if method == "c14n" else "us-ascii"
        enc_lower = encoding.lower()
        with _get_writer(file_or_filename, enc_lower) as write:
            if method == "xml":
                if xml_declaration is None:
                    # Undecided: declare only for "unusual" encodings.
                    declare = enc_lower not in ("utf-8", "us-ascii",
                                                "unicode")
                else:
                    declare = xml_declaration
                if declare:
                    declared_encoding = encoding
                    if enc_lower == "unicode":
                        # Retrieve the default encoding for the xml
                        # declaration
                        import locale
                        declared_encoding = locale.getpreferredencoding()
                    write("<?xml version='1.0' encoding='%s'?>\n" % (
                        declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root,
                                                 default_namespace)
                serializer = _serialize[method]
                serializer(write, self._root, qnames, namespaces,
                           short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")

754

755# --------------------------------------------------------------------

756# serialization support

757

@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Yield a text write callable and release all resources afterwards.

    *file_or_filename* is either an object with a write attribute or
    something to be passed to open().  *encoding* is the target encoding;
    the special value "unicode" means the destination already accepts
    text, so no encoding layer is added.
    """
    try:
        write = file_or_filename.write
    except AttributeError:
        # No write attribute: file_or_filename is a file name.
        open_options = {}
        if encoding != "unicode":
            open_options = {"encoding": encoding,
                            "errors": "xmlcharrefreplace"}
        with open(file_or_filename, "w", **open_options) as stream:
            yield stream.write
    else:
        # file_or_filename is a file-like object; the encoding decides
        # whether it is treated as a text or a binary writer.
        if encoding == "unicode":
            # use a text writer as is
            yield write
            return
        # wrap a binary writer with TextIOWrapper
        with contextlib.ExitStack() as cleanup:
            if isinstance(file_or_filename, io.BufferedIOBase):
                binary = file_or_filename
            elif isinstance(file_or_filename, io.RawIOBase):
                binary = io.BufferedWriter(file_or_filename)
                # Keep the original file open when the BufferedWriter is
                # destroyed
                cleanup.callback(binary.detach)
            else:
                # This is to handle passed objects that aren't in the
                # IOBase hierarchy, but just have a write method
                binary = io.BufferedIOBase()
                binary.writable = lambda: True
                binary.write = write
                # TextIOWrapper uses these methods to determine if a BOM
                # (for UTF-16, etc) should be added; copy them over when
                # the object provides them.
                with contextlib.suppress(AttributeError):
                    binary.seekable = file_or_filename.seekable
                    binary.tell = file_or_filename.tell
            text_layer = io.TextIOWrapper(binary,
                                          encoding=encoding,
                                          errors="xmlcharrefreplace",
                                          newline="\n")
            # Keep the original file open when the TextIOWrapper is
            # destroyed
            cleanup.callback(text_layer.detach)
            yield text_layer.write

809

def _namespaces(elem, default_namespace=None):
    """Identify the namespaces used in the tree rooted at *elem*.

    Returns a (qnames, namespaces) pair: *qnames* maps every qualified
    name found in tags, attribute names, QName attribute values and QName
    text to its serialized prefix:local form; *namespaces* maps namespace
    URIs to the prefixes chosen for them.  If *default_namespace* is
    given it is bound to the empty prefix, and any non-qualified name
    then raises ValueError.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # Prefer a registered prefix, else invent "nsN".
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def ensure(name):
        # register *name* unless it has been seen already
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            ensure(tag.text)
        elif isinstance(tag, str):
            ensure(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            ensure(key)
            if isinstance(value, QName):
                ensure(value.text)
        text = node.text
        if isinstance(text, QName):
            ensure(text.text)
    return qnames, namespaces

870

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is called with text fragments.  *qnames* maps tags and
    attribute names to their serialized form; *namespaces* maps namespace
    URIs to prefixes and is only passed for the outermost element, whose
    start tag receives the xmlns declarations (recursive calls pass None).
    When *short_empty_elements* is true, elements without text or
    children are written as a self-closed tag.  Extra keyword arguments
    are accepted and ignored.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # Comment nodes are wrapped in XML comment delimiters.  The text
        # is written as is, without escaping.
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # A tag that maps to None produces no markup of its own; only
            # its text and children are written.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        # QName values are written in prefix:local form.
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

920

# Lower-cased tag names for which _serialize_html writes no end tag.
HTML_EMPTY = {
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
}

928

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is called with text fragments.  *qnames* maps tags and
    attribute names to their serialized form; *namespaces* maps namespace
    URIs to prefixes and is only passed for the outermost element
    (recursive calls pass None).  Start tags are always written in full,
    the text of script and style elements is written unescaped, and no end
    tag is written for tags listed in HTML_EMPTY.  Extra keyword arguments
    are accepted and ignored.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # Comment nodes are wrapped in comment delimiters around the
        # escaped text.
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # A tag that maps to None produces no markup of its own; only
            # its text and children are written.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        # QName values are written in prefix:local form.
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

978

def _serialize_text(write, elem):
    """Write the text content of *elem* through *write*.

    Every string produced by elem.itertext() is written in order,
    followed by the element's own tail if it has one.
    """
    for piece in elem.itertext():
        write(piece)
    if not elem.tail:
        return
    write(elem.tail)

984

# Output method name -> serializer function.
# The optional "c14n" method (_serialize_c14n) is imported at the end of
# the module.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
)

992

993

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global: any existing entry that uses either the given
    *prefix* or the given *uri* is dropped before the new mapping is stored.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if *prefix* has the reserved form ``ns<digits>``.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting keys first so the dict is not resized while
    # it is being iterated.
    stale_uris = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in stale_uris:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

1012

# Global registry mapping namespace URI -> preferred prefix.
_namespace_map = dict((
    # "well-known" namespace prefixes
    ("http://www.w3.org/XML/1998/namespace", "xml"),
    ("http://www.w3.org/1999/xhtml", "html"),
    ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdf"),
    ("http://schemas.xmlsoap.org/wsdl/", "wsdl"),
    # xml schema
    ("http://www.w3.org/2001/XMLSchema", "xs"),
    ("http://www.w3.org/2001/XMLSchema-instance", "xsi"),
    # dublin core
    ("http://purl.org/dc/elements/1.1/", "dc"),
))
# Expose the registry on the function object, for tests and troubleshooting.
register_namespace._namespace_map = _namespace_map

1027

def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* cannot be serialized.

    The message contains the repr of *text* and the name of its type.
    """
    type_name = type(text).__name__
    raise TypeError("cannot serialize %r (type %s)" % (text, type_name))

1032

def _escape_cdata(text):
    """Return *text* with ``&``, ``<`` and ``>`` replaced by entity references.

    ``&`` is handled first so that the ampersands introduced by the other
    replacements are not escaped a second time.

    Raises TypeError (via _raise_serialization_error) when *text* does not
    support the string operations used here.
    """
    # escape character data
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 characters, or so.  assume that's, by far,
        # the most common case in most applications.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1048

def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    ``&``, ``<``, ``>`` and ``"`` become entity references; carriage return,
    line feed and tab become numeric character references.  ``&`` is handled
    first so that later replacements are not escaped twice.

    Raises TypeError (via _raise_serialization_error) when *text* does not
    support the string operations used here.
    """
    # escape attribute value
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        # Although section 2.11 of the XML specification states that CR or
        # CR LF should be replaced with just LF, it applies only to EOLs
        # which take part of organizing file into lines. Within attributes,
        # we are replacing these with entity numbers, so they do not count.
        # http://www.w3.org/TR/REC-xml/#sec-line-ends
        # The current solution, contained in following six lines, was
        # discussed in issue 17582 and 39011.
        if "\r" in text:
            text = text.replace("\r", "&#13;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        if "\t" in text:
            text = text.replace("\t", "&#09;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1076

def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    Only ``&``, ``>`` and ``"`` are replaced by entity references; ``<`` is
    left untouched.  ``&`` is handled first so that later replacements are
    not escaped twice.

    Raises TypeError (via _raise_serialization_error) when *text* does not
    support the string operations used here.
    """
    # escape attribute value
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1089

1090# --------------------------------------------------------------------

1091

def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  *xml_declaration* and
    *short_empty_elements* are forwarded unchanged to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # Text output needs a text buffer; every other encoding produces bytes.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(buffer, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return buffer.getvalue()

1115

class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference."""

    def __init__(self, lst):
        # The caller keeps its own reference to *lst* and reads the
        # written chunks from it afterwards.
        self.lst = lst

    def write(self, b):
        """Append the chunk *b* to the underlying list."""
        self.lst.append(b)

    def tell(self):
        """Return the number of chunks written so far."""
        return len(self.lst)

    def writable(self):
        """Report the stream as writable."""
        return True

    def seekable(self):
        """Report the stream as seekable."""
        return True

1132

def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Serialize *element* and return the output as a list of chunks.

    All arguments are forwarded to ElementTree.write(); the chunks that
    write() hands to the stream are collected, in order, in the returned
    list.
    """
    chunks = []
    ElementTree(element).write(_ListDataStream(chunks), encoding,
                               xml_declaration=xml_declaration,
                               default_namespace=default_namespace,
                               method=method,
                               short_empty_elements=short_empty_elements)
    return chunks

1144

1145

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    # Finish the output with a newline unless the root's tail supplied one.
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")

1163

1164

def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level. Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    Raises ValueError if *level* is negative.  Text and tail values that
    contain non-whitespace characters are left untouched.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        # No children: there is nothing to indent.
        return

    # Reduce the memory consumption by reusing indentation strings.
    # indentations[i] is the newline + whitespace used i levels below the
    # starting element.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            # First time this depth is reached: derive it from the parent.
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)

1213

1214

1215# --------------------------------------------------------------------

1216# parsing

1217

1218

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document

1231

1232

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    The returned iterator also reports what's going on to the user based
    on the *events* it is initialized with.  The supported events are the
    strings "start", "end", "start-ns" and "end-ns" (the "ns" events are used
    to get detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the input has been consumed completely, and is
    then set to the root element.  Calling ``close()`` on the iterator stops
    the iteration and, when *source* was a filename, closes the file that
    was opened here; a file object passed in by the caller is never closed.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator():
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            if close_source:
                source.close()

    # The generator body does not run until the first next() call, so it
    # only ever sees the final values of `source` and `close_source` that
    # are assigned further down.
    gen = iterator()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = gen.__next__

        def close(self):
            """Stop iterating and release the file opened by iterparse."""
            # A generator that was never started skips its `finally`
            # clause on close(), so the file is closed explicitly as well.
            if close_source:
                source.close()
            gen.close()

    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    return it

1279

1280

class XMLPullParser:
    """Parser that is fed data in pieces and queues the resulting events.

    Data is pushed in with feed(); the (event, elem) pairs produced so far
    are pulled out with read_events().
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        if not _parser:
            _parser = XMLParser(target=TreeBuilder())
        self._parser = _parser
        # wire up the parser for event reporting; only "end" events are
        # reported when the caller did not ask for anything specific
        self._parser._setevents(
            self._events_queue,
            ("end",) if events is None else events,
        )

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so it is raised from read_events(), after
            # the events that were queued before it.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when it
        is reached.
        """
        pending = self._events_queue
        while pending:
            item = pending.popleft()
            if isinstance(item, Exception):
                raise item
            yield item

1332

1333

def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()

1349

1350

def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements whose "id" attribute is
    missing or empty are not included in the dict.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    root = parser.close()
    elements_by_id = {}
    for node in root.iter():
        node_id = node.get("id")
        if node_id:
            elements_by_id[node_id] = node
    return root, elements_by_id

1371

# fromstring() is an alias for XML(): parse an XML document from a string
# constant.
fromstring = XML

1374

def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for fragment in sequence:
        parser.feed(fragment)
    return parser.close()

1389

1390# --------------------------------------------------------------------

1391

1392

class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []  # text chunks collected since the last flush
        self._elem = []  # stack of currently open elements
        self._last = None  # most recently opened or closed element
        self._root = None  # first element that was opened
        self._tail = None  # true if we're after an end tag
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self.insert_comments = insert_comments
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_pis = insert_pis
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected text to the last element: as its tail when
        # that element has already been closed, as its text otherwise.
        if not self._data:
            return
        owner = self._last
        if owner is not None:
            text = "".join(self._data)
            if self._tail:
                assert owner.tail is None, "internal error (tail)"
                owner.tail = text
            else:
                assert owner.text is None, "internal error (text)"
                owner.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        open_elements = self._elem
        if open_elements:
            open_elements[-1].append(elem)
        elif self._root is None:
            self._root = elem
        open_elements.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node (comment or PI) and, when *insert* is
        # true, append it to the currently open element, if there is one.
        node = factory(*args)
        if not insert:
            return node
        self._flush()
        self._last = node
        if self._elem:
            self._elem[-1].append(node)
        self._tail = 1
        return node

1512

1513

1514# also see ElementTree and TreeBuilder

class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    Only the callbacks that *target* actually implements (start, end,
    start_ns, end_ns, data, comment, pi) are registered with expat.

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" separates namespace URI and local name in the names expat
        # reports; _fixname() adds the leading "{".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # Exception class of whichever expat module was imported above.
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None  # list of doctype tokens while inside one
        self.entity = {}  # replacement text for otherwise undefined entities
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler receives the values it needs as default arguments,
        # so they are bound when the handler is defined.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, copying the error code and
        # the (line, column) position over.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # Missing prefix/uri values are passed to the target as ''.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Installed as expat's DefaultHandlerExpand.  Handles entity
        # references (text starting with "&") and collects the tokens of a
        # doctype declaration so target.doctype() can be called.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                # Use the exception class stored by __init__ so the same
                # expat module (xml.parsers.expat or the pyexpat fallback)
                # is used, and feed()/close() convert it to ParseError.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        # strip the surrounding quote characters
                        pubid = pubid[1:-1]
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, False)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse(b"", True) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

1740

1741

1742# --------------------------------------------------------------------

1743# C14N 2.0

1744

def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string.

    Either *xml_data* (an XML string) or *from_file* (a file path or
    file-like object) must be provided as input.  When both are given,
    *xml_data* is used.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only when the caller gave no sink.
    capture = None
    if out is None:
        capture = out = io.StringIO()

    parser = XMLParser(target=C14NWriterTarget(out.write, **options))

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # from_file cannot be None here because of the check above.
        parse(from_file, parser=parser)

    return None if capture is None else capture.getvalue()

1773

1774

# Bound ``match`` of a pattern accepting strings of the form "prefix:name",
# where both parts consist of one or more word characters.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', flags=re.UNICODE).match

1776

1777

1778class C14NWriterTarget:

1779 """

1780 Canonicalization writer target for the XMLParser.

1781

1782 Serialises parse events to XML C14N 2.0.

1783

1784 The *write* function is used for writing out the resulting data stream

1785 as text (not bytes). To write to a file, open it in text mode with encoding

1786 "utf-8" and pass its ``.write`` method.

1787

1788 Configuration options:

1789

1790 - *with_comments*: set to true to include comments

1791 - *strip_text*: set to true to strip whitespace before and after text content

1792 - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"

1793 - *qname_aware_tags*: a set of qname aware tag names in which prefixes

1794 should be replaced in text content

1795 - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes

1796 should be replaced in text content

1797 - *exclude_attrs*: a set of attribute names that should not be serialised

1798 - *exclude_tags*: a set of tag names that should not be serialised

1799 """

1800 def __init__(self, write, *,

1801 with_comments=False, strip_text=False, rewrite_prefixes=False,

1802 qname_aware_tags=None, qname_aware_attrs=None,

1803 exclude_attrs=None, exclude_tags=None):

1804 self._write = write

1805 self._data = []

1806 self._with_comments = with_comments

1807 self._strip_text = strip_text

1808 self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None

1809 self._exclude_tags = set(exclude_tags) if exclude_tags else None

1810

1811 self._rewrite_prefixes = rewrite_prefixes

1812 if qname_aware_tags:

1813 self._qname_aware_tags = set(qname_aware_tags)

1814 else:

1815 self._qname_aware_tags = None

1816 if qname_aware_attrs:

1817 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection

1818 else:

1819 self._find_qname_aware_attrs = None

1820

1821 # Stack with globally and newly declared namespaces as (uri, prefix) pairs.

1822 self._declared_ns_stack = [[

1823 ("http://www.w3.org/XML/1998/namespace", "xml"),

1824 ]]

1825 # Stack with user declared namespace prefixes as (uri, prefix) pairs.

1826 self._ns_stack = []

1827 if not rewrite_prefixes:

1828 self._ns_stack.append(list(_namespace_map.items()))

1829 self._ns_stack.append([])

1830 self._prefix_map = {}

1831 self._preserve_space = [False]

1832 self._pending_start = None

1833 self._root_seen = False

1834 self._root_done = False

1835 self._ignored_depth = 0

1836

1837 def _iter_namespaces(self, ns_stack, _reversed=reversed):

1838 for namespaces in _reversed(ns_stack):

1839 if namespaces: # almost no element declares new namespaces

1840 yield from namespaces

1841

1842 def _resolve_prefix_name(self, prefixed_name):

1843 prefix, name = prefixed_name.split(':', 1)

1844 for uri, p in self._iter_namespaces(self._ns_stack):

1845 if p == prefix:

1846 return f'{{{uri}}}{name}'

1847 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

1848

1849 def _qname(self, qname, uri=None):

1850 if uri is None:

1851 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)

1852 else:

1853 tag = qname

1854

1855 prefixes_seen = set()

1856 for u, prefix in self._iter_namespaces(self._declared_ns_stack):

1857 if u == uri and prefix not in prefixes_seen:

1858 return f'{prefix}:{tag}' if prefix else tag, tag, uri

1859 prefixes_seen.add(prefix)

1860

1861 # Not declared yet => add new declaration.

1862 if self._rewrite_prefixes:

1863 if uri in self._prefix_map:

1864 prefix = self._prefix_map[uri]

1865 else:

1866 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'

1867 self._declared_ns_stack[-1].append((uri, prefix))

1868 return f'{prefix}:{tag}', tag, uri

1869

1870 if not uri and '' not in prefixes_seen:

1871 # No default namespace declared => no prefix needed.

1872 return tag, tag, uri

1873

1874 for u, prefix in self._iter_namespaces(self._ns_stack):

1875 if u == uri:

1876 self._declared_ns_stack[-1].append((uri, prefix))

1877 return f'{prefix}:{tag}' if prefix else tag, tag, uri

1878

1879 if not uri:

1880 # As soon as a default namespace is defined,

1881 # anything that has no namespace (and thus, no prefix) goes there.

1882 return tag, tag, uri

1883

1884 raise ValueError(f'Namespace "{uri}" is not declared in scope')

1885

1886 def data(self, data):

1887 if not self._ignored_depth:

1888 self._data.append(data)

1889

1890 def _flush(self, _join_text=''.join):

1891 data = _join_text(self._data)

1892 del self._data[:]

1893 if self._strip_text and not self._preserve_space[-1]:

1894 data = data.strip()

1895 if self._pending_start is not None:

1896 args, self._pending_start = self._pending_start, None

1897 qname_text = data if data and _looks_like_prefix_name(data) else None

1898 self._start(*args, qname_text)

1899 if qname_text is not None:

1900 return

1901 if data and self._root_seen:

1902 self._write(_escape_cdata_c14n(data))

1903

1904 def start_ns(self, prefix, uri):

1905 if self._ignored_depth:

1906 return

1907 # we may have to resolve qnames in text content

1908 if self._data:

1909 self._flush()

1910 self._ns_stack[-1].append((uri, prefix))

1911

1912 def start(self, tag, attrs):

1913 if self._exclude_tags is not None and (

1914 self._ignored_depth or tag in self._exclude_tags):

1915 self._ignored_depth += 1

1916 return

1917 if self._data:

1918 self._flush()

1919

1920 new_namespaces = []

1921 self._declared_ns_stack.append(new_namespaces)

1922

1923 if self._qname_aware_tags is not None and tag in self._qname_aware_tags:

1924 # Need to parse text first to see if it requires a prefix declaration.

1925 self._pending_start = (tag, attrs, new_namespaces)

1926 return

1927 self._start(tag, attrs, new_namespaces)

1928

def _start(self, tag, attrs, new_namespaces, qname_text=None):
    """Write the start tag for *tag* and push per-element state.

    *attrs* maps attribute names to values; attributes listed in
    ``_exclude_attrs`` are dropped.  *new_namespaces* is the list of
    ``(uri, prefix)`` declarations to write on this element.  *qname_text*,
    when given, is prefixed-name text content that is resolved and written
    directly after the start tag.
    """
    excluded = self._exclude_attrs
    if excluded is not None and attrs:
        attrs = {name: value for name, value in attrs.items()
                 if name not in excluded}

    # Every qualified name needing a prefix: the tag, the attribute names,
    # and any QName values found in text or attributes below.
    names_in_use = set(attrs)
    names_in_use.add(tag)
    prefixed_to_qname = {}

    # Resolve prefixes in attribute and tag text.
    if qname_text is not None:
        resolved = self._resolve_prefix_name(qname_text)
        prefixed_to_qname[qname_text] = resolved
        names_in_use.add(resolved)

    qattrs = None
    find_qattrs = self._find_qname_aware_attrs
    if find_qattrs is not None and attrs:
        qattrs = find_qattrs(attrs) or None
        if qattrs is not None:
            for attr_name in qattrs:
                attr_value = attrs[attr_name]
                if _looks_like_prefix_name(attr_value):
                    resolved = self._resolve_prefix_name(attr_value)
                    prefixed_to_qname[attr_value] = resolved
                    names_in_use.add(resolved)

    # Assign prefixes in lexicographical order of used URIs.  This has to
    # happen before the declarations are collected below.
    parsed = {}
    for name in sorted(names_in_use, key=lambda n: n.split('}', 1)):
        parsed[name] = self._qname(name)

    # Write namespace declarations in prefix order ...
    if new_namespaces:
        attr_list = sorted(
            ('xmlns:' + prefix if prefix else 'xmlns', uri)
            for uri, prefix in new_namespaces)
    else:
        # almost always empty
        attr_list = []

    # ... followed by attributes in URI+name order
    if attrs:
        for key, value in sorted(attrs.items()):
            if (qattrs is not None and key in qattrs
                    and value in prefixed_to_qname):
                value = parsed[prefixed_to_qname[value]][0]
            prefixed, local, uri = parsed[key]
            # No prefix for attributes in default ('') namespace.
            attr_list.append((prefixed if uri else local, value))

    # Honour xml:space attributes; without one, inherit the enclosing state.
    space = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
    if space:
        preserve = space == 'preserve'
    else:
        preserve = self._preserve_space[-1]
    self._preserve_space.append(preserve)

    # Write the tag.
    emit = self._write
    emit('<' + parsed[tag][0])
    if attr_list:
        emit(''.join(
            f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list))
    emit('>')

    # Write the resolved qname text content.
    if qname_text is not None:
        emit(_escape_cdata_c14n(parsed[prefixed_to_qname[qname_text]][0]))

    self._root_seen = True
    self._ns_stack.append([])

1997

def end(self, tag):
    """Handle an element end event.

    Inside an excluded subtree only the nesting counter is decremented.
    Otherwise pending output is flushed, the closing tag is written, and
    the per-element state (``xml:space`` flag, declared namespaces,
    namespace scope) is popped.  ``_root_done`` becomes true once only the
    initial entry is left on ``_preserve_space``.
    """
    if self._ignored_depth:
        self._ignored_depth -= 1
        return
    # A deferred start tag is only written by _flush().  Flush it even when
    # no text was buffered, so that an empty QName-aware element still gets
    # its start tag (and its state pushed) before the end tag is written.
    if self._data or self._pending_start is not None:
        self._flush()
    self._write(f'</{self._qname(tag)[0]}>')
    self._preserve_space.pop()
    self._root_done = len(self._preserve_space) == 1
    self._declared_ns_stack.pop()
    self._ns_stack.pop()

2009

def comment(self, text):
    """Write a comment node as ``<!--text-->``.

    Nothing is written when comments are disabled (``_with_comments`` is
    false) or while inside an excluded subtree.  A comment after the root
    element is preceded by a newline; a comment before the root element is
    followed by one.
    """
    if not self._with_comments:
        return
    if self._ignored_depth:
        return
    if self._root_done:
        self._write('\n')
    elif self._pending_start is not None or (self._root_seen and self._data):
        # Complete a deferred start tag and write buffered text first, so
        # the comment appears in document order.
        self._flush()
    self._write(f'<!--{_escape_cdata_c14n(text)}-->')
    if not self._root_seen:
        self._write('\n')

2022

def pi(self, target, data):
    """Write a processing instruction as ``<?target data?>``.

    When *data* is empty or None only ``<?target?>`` is written.  Nothing
    is written inside an excluded subtree.  A processing instruction after
    the root element is preceded by a newline; one before the root element
    is followed by a newline.
    """
    if self._ignored_depth:
        return
    if self._root_done:
        self._write('\n')
    elif self._pending_start is not None or (self._root_seen and self._data):
        # Complete a deferred start tag and write buffered text first, so
        # the instruction appears in document order.
        self._flush()
    if data:
        self._write(f'<?{target} {_escape_cdata_c14n(data)}?>')
    else:
        self._write(f'<?{target}?>')
    if not self._root_seen:
        self._write('\n')

2034

2035

def _escape_cdata_c14n(text):
    """Escape character data for canonical XML output.

    ``&``, ``<`` and ``>`` are replaced by ``&amp;``, ``&lt;`` and
    ``&gt;``; a carriage return is replaced by the character reference
    ``&#xD;``.  Input that does not support the string operations used
    here is reported through ``_raise_serialization_error()``.
    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so.  assume that's, by far,
        # the most common case in most applications.
        if '&' in text:
            # Must run first so the entities added below are not re-escaped.
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '>' in text:
            text = text.replace('>', '&gt;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

2053

2054

def _escape_attrib_c14n(text):
    """Escape an attribute value for canonical XML output.

    ``&``, ``<`` and ``"`` are replaced by ``&amp;``, ``&lt;`` and
    ``&quot;``; tab, line feed and carriage return are replaced by the
    character references ``&#x9;``, ``&#xA;`` and ``&#xD;``.  ``>`` is left
    as it is.  Input that does not support the string operations used here
    is reported through ``_raise_serialization_error()``.
    """
    try:
        if '&' in text:
            # Must run first so the entities added below are not re-escaped.
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '"' in text:
            text = text.replace('"', '&quot;')
        if '\t' in text:
            text = text.replace('\t', '&#x9;')
        if '\n' in text:
            text = text.replace('\n', '&#xA;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

2073

2074

2075# --------------------------------------------------------------------

2076

2077# Import the C accelerators

try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests).
    _Element_Py = Element

    # The star-import rebinds the module-level names with the accelerated
    # versions.  Names expected from the C module:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    # Imported explicitly as well; presumably because a star-import skips
    # underscore-prefixed names -- confirm against _elementtree's __all__.
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator available: keep using the pure-Python definitions.
    pass
else:
    # Pass the Comment and ProcessingInstruction factories to the C module.
    # NOTE(review): presumably so the accelerated classes can create those
    # node types -- confirm against the _elementtree implementation.
    _set_factories(Comment, ProcessingInstruction)