# Add x/html serialization to `ElementTree`
2# Taken from ElementTree 1.3 preview with slight modifications
3#
4# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
5#
6# fredrik@pythonware.com
7# https://www.pythonware.com/
8#
9# --------------------------------------------------------------------
10# The ElementTree toolkit is
11#
12# Copyright (c) 1999-2007 by Fredrik Lundh
13#
14# By obtaining, using, and/or copying this software and/or its
15# associated documentation, you agree that you have read, understood,
16# and will comply with the following terms and conditions:
17#
18# Permission to use, copy, modify, and distribute this software and
19# its associated documentation for any purpose and without fee is
20# hereby granted, provided that the above copyright notice appears in
21# all copies, and that both that copyright notice and this permission
22# notice appear in supporting documentation, and that the name of
23# Secret Labs AB or the author not be used in advertising or publicity
24# pertaining to distribution of the software without specific, written
25# prior permission.
26#
27# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
28# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
29# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
30# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
31# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
32# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
33# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
34# OF THIS SOFTWARE.
35# --------------------------------------------------------------------
36
37"""
38Python-Markdown provides two serializers which render [`ElementTree.Element`][xml.etree.ElementTree.Element]
39objects to a string of HTML. Both functions wrap the same underlying code with only a few minor
40differences as outlined below:
41
421. Empty (self-closing) tags are rendered as `<tag>` for HTML and as `<tag />` for XHTML.
432. Boolean attributes are rendered as `attrname` for HTML and as `attrname="attrname"` for XHTML.
44"""
45
46from __future__ import annotations
47
48from xml.etree.ElementTree import ProcessingInstruction
49from xml.etree.ElementTree import Comment, ElementTree, Element, QName, HTML_EMPTY
50import re
51from typing import Callable, Literal, NoReturn
52
# Public API of this module: the two serializer entry points.
__all__ = ['to_html_string', 'to_xhtml_string']
54
# Matches an `&` that does NOT begin a character reference, i.e. one that is
# not followed by `#<digits>;`, `#x<hex digits>;` or `<alphanumerics>;`.
# Matching is case-insensitive.
RE_AMP = re.compile(
    r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)',
    re.IGNORECASE,
)
56
57
58def _raise_serialization_error(text: str) -> NoReturn: # pragma: no cover
59 raise TypeError(
60 "cannot serialize {!r} (type {})".format(text, type(text).__name__)
61 )
62
63
64def _escape_cdata(text) -> str:
65 # escape character data
66 try:
67 # it's worth avoiding do-nothing calls for strings that are
68 # shorter than 500 character, or so. assume that's, by far,
69 # the most common case in most applications.
70 if "&" in text:
71 # Only replace & when not part of an entity
72 text = RE_AMP.sub('&', text)
73 if "<" in text:
74 text = text.replace("<", "<")
75 if ">" in text:
76 text = text.replace(">", ">")
77 return text
78 except (TypeError, AttributeError): # pragma: no cover
79 _raise_serialization_error(text)
80
81
82def _escape_attrib(text: str) -> str:
83 # escape attribute value
84 try:
85 if "&" in text:
86 # Only replace & when not part of an entity
87 text = RE_AMP.sub('&', text)
88 if "<" in text:
89 text = text.replace("<", "<")
90 if ">" in text:
91 text = text.replace(">", ">")
92 if "\"" in text:
93 text = text.replace("\"", """)
94 if "\n" in text:
95 text = text.replace("\n", " ")
96 return text
97 except (TypeError, AttributeError): # pragma: no cover
98 _raise_serialization_error(text)
99
100
101def _escape_attrib_html(text: str) -> str:
102 # escape attribute value
103 try:
104 if "&" in text:
105 # Only replace & when not part of an entity
106 text = RE_AMP.sub('&', text)
107 if "<" in text:
108 text = text.replace("<", "<")
109 if ">" in text:
110 text = text.replace(">", ">")
111 if "\"" in text:
112 text = text.replace("\"", """)
113 return text
114 except (TypeError, AttributeError): # pragma: no cover
115 _raise_serialization_error(text)
116
117
118def _serialize_html(write: Callable[[str], None], elem: Element, format: Literal["html", "xhtml"]) -> None:
119 tag = elem.tag
120 text = elem.text
121 if tag is Comment:
122 write("<!--%s-->" % _escape_cdata(text))
123 elif tag is ProcessingInstruction:
124 write("<?%s?>" % _escape_cdata(text))
125 elif tag is None:
126 if text:
127 write(_escape_cdata(text))
128 for e in elem:
129 _serialize_html(write, e, format)
130 else:
131 namespace_uri = None
132 if isinstance(tag, QName):
133 # `QNAME` objects store their data as a string: `{uri}tag`
134 if tag.text[:1] == "{":
135 namespace_uri, tag = tag.text[1:].split("}", 1)
136 else:
137 raise ValueError('QName objects must define a tag.')
138 write("<" + tag)
139 items = elem.items()
140 if items:
141 items = sorted(items) # lexical order
142 for k, v in items:
143 if isinstance(k, QName):
144 # Assume a text only `QName`
145 k = k.text
146 if isinstance(v, QName):
147 # Assume a text only `QName`
148 v = v.text
149 else:
150 v = _escape_attrib_html(v)
151 if k == v and format == 'html':
152 # handle boolean attributes
153 write(" %s" % v)
154 else:
155 write(' {}="{}"'.format(k, v))
156 if namespace_uri:
157 write(' xmlns="%s"' % (_escape_attrib(namespace_uri)))
158 if format == "xhtml" and tag.lower() in HTML_EMPTY:
159 write(" />")
160 else:
161 write(">")
162 if text:
163 if tag.lower() in ["script", "style"]:
164 write(text)
165 else:
166 write(_escape_cdata(text))
167 for e in elem:
168 _serialize_html(write, e, format)
169 if tag.lower() not in HTML_EMPTY:
170 write("</" + tag + ">")
171 if elem.tail:
172 write(_escape_cdata(elem.tail))
173
174
def _write_html(root: Element, format: Literal["html", "xhtml"] = "html") -> str:
    """Serialize `root` in the given `format` and return the result as one string.

    Fragments produced by `_serialize_html` are collected in a list and
    joined at the end.
    """
    assert root is not None
    fragments: list[str] = []
    _serialize_html(fragments.append, root, format)
    return "".join(fragments)
181
182
183# --------------------------------------------------------------------
184# public functions
185
186
def to_html_string(element: Element) -> str:
    """ Serialize element and its children to a string of HTML5. """
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
190
191
def to_xhtml_string(element: Element) -> str:
    """ Serialize element and its children to a string of XHTML. """
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")