Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/django/utils/html.py: 31%
194 statements
« prev ^ index » next coverage.py v7.0.5, created at 2023-01-17 06:13 +0000
« prev ^ index » next coverage.py v7.0.5, created at 2023-01-17 06:13 +0000
1"""HTML utilities suitable for global use."""
3import html
4import json
5import re
6from html.parser import HTMLParser
7from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit
9from django.utils.encoding import punycode
10from django.utils.functional import Promise, keep_lazy, keep_lazy_text
11from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS
12from django.utils.regex_helper import _lazy_re_compile
13from django.utils.safestring import SafeData, SafeString, mark_safe
14from django.utils.text import normalize_newlines
@keep_lazy(SafeString)
def escape(text):
    """
    Encode ampersands, quotes and angle brackets in ``text`` for use in HTML.

    The input is converted with str() and escaped unconditionally, even when
    it has already been escaped and marked as safe, so double-escaping is
    possible. Use conditional_escape() when that is a concern.
    """
    escaped = html.escape(str(text))
    return SafeString(escaped)
# Translation table for escapejs(): each listed character maps to its
# upper-case \uXXXX escape sequence.
_js_escapes = {
    ord(char): "\\u%04X" % ord(char)
    for char in "\\'\"><&=-;`\u2028\u2029"
}

# Every ASCII character with a value less than 32 is escaped as well.
_js_escapes.update({code: "\\u%04X" % code for code in range(32)})
@keep_lazy(SafeString)
def escapejs(value):
    """Hex encode characters for use in JavaScript strings."""
    # str.translate() applies the _js_escapes table in a single pass.
    escaped = str(value).translate(_js_escapes)
    return mark_safe(escaped)
# Translation table for json_script(): the HTML-significant characters
# ">", "<" and "&" are replaced by their \uXXXX escapes.
_json_script_escapes = {ord(char): "\\u%04X" % ord(char) for char in "><&"}
def json_script(value, element_id=None, encoder=None):
    """
    Serialize ``value`` to JSON and wrap it in a <script> tag.

    All the HTML/XML special characters are replaced with their unicode
    escapes, so the value is safe to be output anywhere except for inside a
    tag attribute. When ``element_id`` is truthy it becomes the tag's ``id``
    attribute. ``encoder`` overrides the default DjangoJSONEncoder.
    """
    from django.core.serializers.json import DjangoJSONEncoder

    serialized = json.dumps(value, cls=encoder or DjangoJSONEncoder)
    payload = mark_safe(serialized.translate(_json_script_escapes))
    if not element_id:
        return format_html('<script type="application/json">{}</script>', payload)
    return format_html(
        '<script id="{}" type="application/json">{}</script>',
        element_id,
        payload,
    )
def conditional_escape(text):
    """
    Escape ``text`` like escape() does, but leave pre-escaped strings alone.

    Objects exposing the __html__ convention (used both by Django's SafeData
    class and by third-party libraries like markupsafe) are rendered through
    that method instead of being escaped again.
    """
    # Lazy values are resolved first so the __html__ check sees the real string.
    if isinstance(text, Promise):
        text = str(text)
    if not hasattr(text, "__html__"):
        return escape(text)
    return text.__html__()
def format_html(format_string, *args, **kwargs):
    """
    Build a small HTML fragment from ``format_string``.

    Works like str.format, except that every positional and keyword argument
    is passed through conditional_escape() and the result is marked safe. Use
    this instead of str.format or % interpolation.
    """
    # Positional arguments are escaped lazily, when format() unpacks them.
    escaped_args = (conditional_escape(arg) for arg in args)
    escaped_kwargs = {}
    for key, val in kwargs.items():
        escaped_kwargs[key] = conditional_escape(val)
    return mark_safe(format_string.format(*escaped_args, **escaped_kwargs))
def format_html_join(sep, format_string, args_generator):
    """
    Format several groups of arguments with format_html() and join them.

    Each item produced by 'args_generator' is a sequence of 'args' that is
    passed to format_html() together with 'format_string'. The fragments are
    joined using 'sep', which is itself passed through conditional_escape().

    Example:

      format_html_join('\\n', "<li>{} {}</li>", ((u.first_name, u.last_name)
                                                  for u in users))
    """
    safe_sep = conditional_escape(sep)
    fragments = (format_html(format_string, *args) for args in args_generator)
    return mark_safe(safe_sep.join(fragments))
@keep_lazy_text
def linebreaks(value, autoescape=False):
    """
    Convert newlines into <p> and <br>s.

    Runs of two or more newlines separate paragraphs; each remaining newline
    becomes a <br>. When ``autoescape`` is true, every paragraph is escaped
    before the <br> tags are inserted.
    """
    normalized = str(normalize_newlines(value))
    paragraphs = []
    for block in re.split("\n{2,}", normalized):
        if autoescape:
            block = escape(block)
        paragraphs.append("<p>%s</p>" % block.replace("\n", "<br>"))
    return "\n\n".join(paragraphs)
class MLStripper(HTMLParser):
    """
    HTML parser that collects text data plus entity and character references.

    No other handler is overridden, so everything else the parser reports
    (tags, comments, ...) is left out of the collected output.
    """

    def __init__(self):
        # Character references are handed to the handlers below undecoded.
        super().__init__(convert_charrefs=False)
        self.reset()
        # Fragments collected so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Record a run of plain text."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Record a named reference in its original ``&name;`` form."""
        self.fed.append(f"&{name};")

    def handle_charref(self, name):
        """Record a numeric reference in its original ``&#name;`` form."""
        self.fed.append(f"&#{name};")

    def get_data(self):
        """Return everything collected so far as a single string."""
        return "".join(self.fed)
def _strip_once(value):
    """
    Run a single MLStripper pass over ``value`` and return the collected text.

    Internal tag stripping utility used by strip_tags.
    """
    stripper = MLStripper()
    stripper.feed(value)
    stripper.close()
    return stripper.get_data()
@keep_lazy_text
def strip_tags(value):
    """Return the given HTML with all tags stripped."""
    current = str(value)
    # Note: in the typical case _strip_once runs only once. The loop condition
    # is redundant, but helps to reduce the number of _strip_once executions.
    while "<" in current and ">" in current:
        stripped = _strip_once(current)
        if stripped.count("<") == current.count("<"):
            # _strip_once wasn't able to detect more tags; keep the text as it
            # was before this pass.
            return current
        current = stripped
    return current
@keep_lazy_text
def strip_spaces_between_tags(value):
    """Return the given HTML with spaces between tags removed."""
    markup = str(value)
    # Only whitespace sitting directly between ">" and "<" is dropped.
    return re.sub(r">\s+<", "><", markup)
def smart_urlquote(url):
    """Quote a URL if it isn't already quoted."""
    # Tilde is part of RFC 3986 Section 2.3 Unreserved Characters,
    # see also https://bugs.python.org/issue16285
    safe_chars = RFC3986_SUBDELIMS + RFC3986_GENDELIMS + "~"

    def unquote_quote(segment):
        # Unquoting first keeps already-quoted input from being quoted twice.
        return quote(unquote(segment), safe=safe_chars)

    # Handle IDN before quoting.
    try:
        scheme, netloc, path, query, fragment = urlsplit(url)
    except ValueError:
        # invalid IPv6 URL (normally square brackets in hostname part).
        return unquote_quote(url)

    try:
        netloc = punycode(netloc)  # IDN -> ACE
    except UnicodeError:  # invalid domain part
        return unquote_quote(url)

    if query:
        # Keys and values are unquoted separately, so as to not mix
        # querystring separators included in query values. See #22267.
        pairs = parse_qsl(query, keep_blank_values=True)
        # urlencode will take care of quoting
        query = urlencode([(unquote(key), unquote(val)) for key, val in pairs])

    path = unquote_quote(path)
    fragment = unquote_quote(fragment)
    return urlunsplit((scheme, netloc, path, query, fragment))
class Urlizer:
    """
    Convert any URLs in text into clickable links.

    Work on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
    """

    trailing_punctuation_chars = ".,:;!"
    wrapping_punctuation = [("(", ")"), ("[", "]")]

    simple_url_re = _lazy_re_compile(r"^https?://\[?\w", re.IGNORECASE)
    simple_url_2_re = _lazy_re_compile(
        r"^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$", re.IGNORECASE
    )
    word_split_re = _lazy_re_compile(r"""([\s<>"']+)""")

    mailto_template = "mailto:{local}@{domain}"
    url_template = '<a href="{href}"{attrs}>{url}</a>'

    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
        """
        Split ``text`` into words and link every word that looks like a URL.

        If trim_url_limit is not None, truncate the URLs in the link text
        longer than this limit to trim_url_limit - 1 characters and append an
        ellipsis.

        If nofollow is True, give the links a rel="nofollow" attribute.

        If autoescape is True, autoescape the link text and URLs.
        """
        options = {
            "safe_input": isinstance(text, SafeData),
            "trim_url_limit": trim_url_limit,
            "nofollow": nofollow,
            "autoescape": autoescape,
        }
        # The split pattern has a capturing group, so separators are kept and
        # pass through handle_word() like any other piece.
        pieces = self.word_split_re.split(str(text))
        return "".join(self.handle_word(piece, **options) for piece in pieces)

    def handle_word(
        self,
        word,
        *,
        safe_input,
        trim_url_limit=None,
        nofollow=False,
        autoescape=False,
    ):
        """
        Return ``word`` turned into a link when it looks like a URL or an
        email address. Otherwise return it marked safe (safe_input), escaped
        (autoescape) or unchanged.
        """
        if any(marker in word for marker in ".@:"):
            # lead: Punctuation trimmed from the beginning of the word.
            # middle: State of the word.
            # trail: Punctuation trimmed from the end of the word.
            lead, middle, trail = self.trim_punctuation(word)
            rel_attr = ' rel="nofollow"' if nofollow else ""
            # Make URL we want to point to.
            target = None
            if self.simple_url_re.match(middle):
                target = smart_urlquote(html.unescape(middle))
            elif self.simple_url_2_re.match(middle):
                target = smart_urlquote("http://%s" % html.unescape(middle))
            elif ":" not in middle and self.is_email_simple(middle):
                local, domain = middle.rsplit("@", 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    # Invalid domain: give the word back untouched.
                    return word
                target = self.mailto_template.format(local=local, domain=domain)
                # mailto: links never get rel="nofollow".
                rel_attr = ""
            # Make link.
            if target:
                label = self.trim_url(middle, limit=trim_url_limit)
                if autoescape and not safe_input:
                    lead = escape(lead)
                    trail = escape(trail)
                    label = escape(label)
                anchor = self.url_template.format(
                    href=escape(target),
                    attrs=rel_attr,
                    url=label,
                )
                return mark_safe(f"{lead}{anchor}{trail}")
        # Not a link: hand the original word back according to the flags.
        if safe_input:
            return mark_safe(word)
        if autoescape:
            return escape(word)
        return word

    def trim_url(self, x, *, limit):
        """
        Return ``x`` cut to ``limit - 1`` characters plus an ellipsis when it
        is longer than ``limit``; return it unchanged otherwise or when
        ``limit`` is None.
        """
        if limit is not None and len(x) > limit:
            keep = max(0, limit - 1)
            return "%s…" % x[:keep]
        return x

    def trim_punctuation(self, word):
        """
        Trim trailing and wrapping punctuation from `word`. Return the items of
        the new state as a (lead, middle, trail) tuple.
        """
        lead = trail = ""
        middle = word
        # Keep making passes until one leaves middle unchanged.
        while True:
            changed = False
            # Trim wrapping punctuation.
            for opener, closer in self.wrapping_punctuation:
                if middle.startswith(opener):
                    lead, middle = lead + opener, middle[len(opener) :]
                    changed = True
                # Keep parentheses at the end only if they're balanced.
                if (
                    middle.endswith(closer)
                    and middle.count(closer) - middle.count(opener) == 1
                ):
                    middle, trail = middle[: -len(closer)], closer + trail
                    changed = True
            # Trim trailing punctuation (after trimming wrapping punctuation,
            # as encoded entities contain ';'). Unescape entities to avoid
            # breaking them by removing ';'.
            unescaped = html.unescape(middle)
            excess = len(unescaped) - len(
                unescaped.rstrip(self.trailing_punctuation_chars)
            )
            if excess:
                trail = middle[-excess:] + trail
                middle = middle[:-excess]
                changed = True
            if not changed:
                return lead, middle, trail

    @staticmethod
    def is_email_simple(value):
        """Return True if value looks like an email address."""
        local, at_sign, domain = value.partition("@")
        # There must be exactly one @, with something in front of it.
        if not (local and at_sign) or "@" in domain:
            return False
        # The domain needs a dot, but not as its first character
        # (e.g. example.com).
        return "." in domain and not domain.startswith(".")
# Module-level Urlizer instance that urlize() delegates to.
urlizer = Urlizer()


@keep_lazy_text
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Convert URLs in ``text`` into clickable links by calling the module-level
    ``urlizer`` instance with the same options.
    """
    options = {
        "trim_url_limit": trim_url_limit,
        "nofollow": nofollow,
        "autoescape": autoescape,
    }
    return urlizer(text, **options)
def avoid_wrapping(value):
    """
    Return ``value`` with every normal space replaced by a non-breaking
    space, to avoid text wrapping in the middle of a phrase.
    """
    non_breaking_space = "\xa0"
    return value.replace(" ", non_breaking_space)
def html_safe(klass):
    """
    A decorator that defines the __html__ method. This helps non-Django
    templates to detect classes whose __str__ methods return SafeString.

    The class's own __str__ is wrapped so that its result is marked safe.
    Raise ValueError if the class itself already defines __html__() or does
    not define __str__().
    """
    # Only attributes defined directly on the class count, not inherited ones.
    namespace = klass.__dict__
    if "__html__" in namespace:
        raise ValueError(
            "can't apply @html_safe to %s because it defines "
            "__html__()." % klass.__name__
        )
    if "__str__" not in namespace:
        raise ValueError(
            "can't apply @html_safe to %s because it doesn't "
            "define __str__()." % klass.__name__
        )
    original_str = klass.__str__
    klass.__str__ = lambda self: mark_safe(original_str(self))
    klass.__html__ = lambda self: str(self)
    return klass