1"""HTML utilities suitable for global use."""
2
3import html
4import json
5import re
6from collections.abc import Mapping
7from html.parser import HTMLParser
8from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit
9
10from django.core.exceptions import SuspiciousOperation
11from django.utils.encoding import punycode
12from django.utils.functional import Promise, cached_property, keep_lazy, keep_lazy_text
13from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS
14from django.utils.regex_helper import _lazy_re_compile
15from django.utils.safestring import SafeData, SafeString, mark_safe
16from django.utils.text import normalize_newlines
17
# Tag names of HTML void elements, i.e. elements that never take a closing
# tag. See https://html.spec.whatwg.org/#void-elements
VOID_ELEMENTS = frozenset(
    {
        # Elements listed by the current HTML specification.
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        # Deprecated tags.
        "frame",
        "spacer",
    }
)
40
# Longest candidate (in characters) that Urlizer.handle_word() will test
# against its URL regular expressions; longer words are not matched as URLs.
MAX_URL_LENGTH = 2048
# Number of tag-removing passes strip_tags() allows before it raises
# SuspiciousOperation.
MAX_STRIP_TAGS_DEPTH = 50
43
44
@keep_lazy(SafeString)
def escape(text):
    """
    Return `text` (coerced to str) with ampersands, quotes and angle brackets
    replaced by HTML entities, wrapped in a SafeString.

    The input is escaped unconditionally, even when it is already escaped and
    marked safe, so double-escaping is possible. Use conditional_escape()
    when that is a concern.
    """
    escaped = html.escape(str(text))
    return SafeString(escaped)
56
57
# Translation table (code point -> "\uXXXX" escape) used by escapejs().
_js_escapes = {
    ord(char): "\\u%04X" % ord(char)
    for char in (
        "\\",
        "'",
        '"',
        ">",
        "<",
        "&",
        "=",
        "-",
        ";",
        "`",
        "\u2028",
        "\u2029",
    )
}

# Escape every ASCII character with a value less than 32.
_js_escapes.update({code: "\\u%04X" % code for code in range(32)})
75
76
@keep_lazy(SafeString)
def escapejs(value):
    """
    Return `value` (coerced to str) with the characters listed in
    _js_escapes replaced by their \\uXXXX escapes, marked safe, for use in
    JavaScript strings.
    """
    translated = str(value).translate(_js_escapes)
    return mark_safe(translated)
81
82
# Translation table applied by json_script() to the serialized JSON: the
# characters ">", "<" and "&" become their \uXXXX escapes.
_json_script_escapes = {ord(char): "\\u%04X" % ord(char) for char in "><&"}
88
89
def json_script(value, element_id=None, encoder=None):
    """
    Serialize `value` to JSON, replace the HTML/XML special characters with
    their unicode escapes so the result is safe to output anywhere except
    inside a tag attribute, and wrap it in a <script type="application/json">
    tag.

    `element_id`, when truthy, becomes the tag's id attribute. `encoder` is
    the JSON encoder class to use; DjangoJSONEncoder is the default.
    """
    from django.core.serializers.json import DjangoJSONEncoder

    encoder_class = encoder or DjangoJSONEncoder
    serialized = json.dumps(value, cls=encoder_class)
    payload = mark_safe(serialized.translate(_json_script_escapes))
    if not element_id:
        return format_html('<script type="application/json">{}</script>', payload)
    return format_html(
        '<script id="{}" type="application/json">{}</script>',
        element_id,
        payload,
    )
108
109
def conditional_escape(text):
    """
    Escape `text` like escape() does, unless it is already pre-escaped.

    Pre-escaped values are recognised through the __html__ convention, which
    is used both by Django's SafeData class and by third-party libraries like
    markupsafe. Lazy (Promise) values are converted to str first.
    """
    candidate = str(text) if isinstance(text, Promise) else text
    if not hasattr(candidate, "__html__"):
        return escape(candidate)
    return candidate.__html__()
123
124
def format_html(format_string, *args, **kwargs):
    """
    Build a small HTML fragment the way str.format would, except that every
    positional and keyword argument goes through conditional_escape() and the
    final result is passed to mark_safe(). Prefer this over str.format or %
    interpolation.

    Raise TypeError when neither args nor kwargs are given.
    """
    if not args and not kwargs:
        raise TypeError("args or kwargs must be provided.")
    # Keyword values are escaped before positional ones, matching the order
    # in which the arguments have always been processed.
    escaped_kwargs = {name: conditional_escape(val) for name, val in kwargs.items()}
    escaped_args = tuple(conditional_escape(arg) for arg in args)
    return mark_safe(format_string.format(*escaped_args, **escaped_kwargs))
136
137
def format_html_join(sep, format_string, args_generator):
    """
    Apply format_html() with the same format string to each group of
    arguments and join the results with 'sep', which is itself passed through
    conditional_escape.

    'args_generator' should be an iterator yielding the 'args' for each
    format_html call: a Mapping is expanded as keyword arguments, anything
    else as positional arguments.

    Example:

      format_html_join("", "<li>{} {}</li>", ((u.first_name, u.last_name)
                                              for u in users))
    """

    def render(item):
        if isinstance(item, Mapping):
            return format_html(format_string, **item)
        return format_html(format_string, *item)

    separator = conditional_escape(sep)
    return mark_safe(separator.join(map(render, args_generator)))
162
163
@keep_lazy_text
def linebreaks(value, autoescape=False):
    """
    Convert newlines into <p> and <br>s.

    Runs of two or more newlines separate paragraphs; a single newline inside
    a paragraph becomes <br>. When `autoescape` is true each paragraph is
    escaped before the <br> substitution.
    """
    text = str(normalize_newlines(value))
    paragraphs = []
    for chunk in re.split("\n{2,}", text):
        if autoescape:
            chunk = escape(chunk)
        paragraphs.append("<p>%s</p>" % chunk.replace("\n", "<br>"))
    return "\n\n".join(paragraphs)
174
175
class MLStripper(HTMLParser):
    """
    HTMLParser subclass that collects text content and drops the tags.

    Character and entity references are not decoded; they are re-emitted in
    their original "&name;" / "&#num;" form.
    """

    def __init__(self):
        # convert_charrefs=False makes the parser report references through
        # handle_entityref()/handle_charref() instead of decoding them.
        super().__init__(convert_charrefs=False)
        self.reset()
        # Text fragments in the order they were parsed.
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def handle_entityref(self, name):
        self.fed.append(f"&{name};")

    def handle_charref(self, name):
        self.fed.append(f"&#{name};")

    def get_data(self):
        """Return everything collected so far as one string."""
        return "".join(self.fed)
193
194
def _strip_once(value):
    """
    Run a single MLStripper pass over `value` and return the collected text.
    Internal helper for strip_tags.
    """
    stripper = MLStripper()
    stripper.feed(value)
    stripper.close()
    return stripper.get_data()
203
204
@keep_lazy_text
def strip_tags(value):
    """
    Return the given HTML with all tags stripped.

    Raise SuspiciousOperation when the input still looks like it contains
    tags after MAX_STRIP_TAGS_DEPTH stripping passes.
    """
    text = str(value)
    passes = 0
    # Typically _strip_once runs twice: the second pass removes nothing and
    # ends the loop.
    while "<" in text and ">" in text:
        if passes >= MAX_STRIP_TAGS_DEPTH:
            raise SuspiciousOperation
        stripped = _strip_once(text)
        if stripped.count("<") == text.count("<"):
            # _strip_once wasn't able to detect more tags.
            break
        text = stripped
        passes += 1
    return text
222
223
@keep_lazy_text
def strip_spaces_between_tags(value):
    """
    Return the given HTML with whitespace between a closing ">" and the next
    opening "<" removed.
    """
    between_tags = re.compile(r">\s+<")
    return between_tags.sub("><", str(value))
228
229
def smart_urlquote(url):
    """Quote a URL if it isn't already quoted."""
    # Tilde is part of RFC 3986 Section 2.3 Unreserved Characters,
    # see also https://bugs.python.org/issue16285
    safe_chars = RFC3986_SUBDELIMS + RFC3986_GENDELIMS + "~"

    def unquote_quote(segment):
        # Unquoting first means already-quoted input is not quoted twice.
        return quote(unquote(segment), safe=safe_chars)

    try:
        scheme, netloc, path, query, fragment = urlsplit(url)
    except ValueError:
        # invalid IPv6 URL (normally square brackets in hostname part).
        return unquote_quote(url)

    # Handle IDN before quoting.
    try:
        netloc = punycode(netloc)  # IDN -> ACE
    except UnicodeError:  # invalid domain part
        return unquote_quote(url)

    if query:
        # Keys and values are unquoted separately so that querystring
        # separators contained in values are not mixed up. See #22267.
        pairs = [
            (unquote(key), unquote(val))
            for key, val in parse_qsl(query, keep_blank_values=True)
        ]
        # urlencode will take care of quoting
        query = urlencode(pairs)

    quoted_path = unquote_quote(path)
    quoted_fragment = unquote_quote(fragment)
    return urlunsplit((scheme, netloc, quoted_path, query, quoted_fragment))
265
266
class CountsDict(dict):
    """
    Dict that lazily caches how often a substring occurs in `word`.

    Looking up a missing key stores and returns `word.count(key)`; later
    lookups (and in-place updates such as `counts[key] -= 1`) work on the
    cached value. Apart from the required keyword-only `word`, the
    constructor accepts the same arguments as dict().
    """

    def __init__(self, *args, word, **kwargs):
        # Forward keyword initialisers as keywords (**kwargs). Unpacking them
        # with a single star would pass the key names positionally and make
        # dict() raise.
        super().__init__(*args, **kwargs)
        self.word = word

    def __missing__(self, key):
        count = self[key] = self.word.count(key)
        return count
275
276
class Urlizer:
    """
    Convert any URLs in text into clickable links.

    Work on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
    """

    trailing_punctuation_chars = ".,:;!"
    wrapping_punctuation = [("(", ")"), ("[", "]")]

    simple_url_re = _lazy_re_compile(r"^https?://\[?\w", re.IGNORECASE)
    simple_url_2_re = _lazy_re_compile(
        r"^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$", re.IGNORECASE
    )
    word_split_re = _lazy_re_compile(r"""([\s<>"']+)""")

    mailto_template = "mailto:{local}@{domain}"
    url_template = '<a href="{href}"{attrs}>{url}</a>'

    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
        """
        If trim_url_limit is not None, truncate the URLs in the link text
        longer than this limit to trim_url_limit - 1 characters and append an
        ellipsis.

        If nofollow is True, give the links a rel="nofollow" attribute.

        If autoescape is True, autoescape the link text and URLs.
        """
        safe_input = isinstance(text, SafeData)

        words = self.word_split_re.split(str(text))
        # Identical words are converted only once per call.
        local_cache = {}
        urlized_words = []
        for word in words:
            if (urlized_word := local_cache.get(word)) is None:
                urlized_word = self.handle_word(
                    word,
                    safe_input=safe_input,
                    trim_url_limit=trim_url_limit,
                    nofollow=nofollow,
                    autoescape=autoescape,
                )
                local_cache[word] = urlized_word
            urlized_words.append(urlized_word)
        return "".join(urlized_words)

    def handle_word(
        self,
        word,
        *,
        safe_input,
        trim_url_limit=None,
        nofollow=False,
        autoescape=False,
    ):
        """
        Return `word` converted to a link when it looks like a URL or an email
        address; otherwise return it marked safe (safe input), escaped
        (autoescape) or unchanged.
        """
        if "." in word or "@" in word or ":" in word:
            # lead: Punctuation trimmed from the beginning of the word.
            # middle: State of the word.
            # trail: Punctuation trimmed from the end of the word.
            lead, middle, trail = self.trim_punctuation(word)
            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ""
            if len(middle) <= MAX_URL_LENGTH and self.simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif len(middle) <= MAX_URL_LENGTH and self.simple_url_2_re.match(middle):
                url = smart_urlquote("http://%s" % html.unescape(middle))
            elif ":" not in middle and self.is_email_simple(middle):
                local, domain = middle.rsplit("@", 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    return word
                local = quote(local, safe="")
                domain = quote(domain, safe="")
                url = self.mailto_template.format(local=local, domain=domain)
                # mailto: links never get rel="nofollow".
                nofollow_attr = ""
            # Make link.
            if url:
                trimmed = self.trim_url(middle, limit=trim_url_limit)
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    trimmed = escape(trimmed)
                middle = self.url_template.format(
                    href=escape(url),
                    attrs=nofollow_attr,
                    url=trimmed,
                )
                return mark_safe(f"{lead}{middle}{trail}")
        # Not a link: fall back to plain-text handling of the original word.
        if safe_input:
            return mark_safe(word)
        if autoescape:
            return escape(word)
        return word

    def trim_url(self, x, *, limit):
        """
        Return `x` shortened to `limit` characters, the last one being an
        ellipsis. `x` is returned unchanged when `limit` is None or not
        exceeded.
        """
        if limit is None or len(x) <= limit:
            return x
        return "%s…" % x[: max(0, limit - 1)]

    @cached_property
    def wrapping_punctuation_openings(self):
        """All opening characters of wrapping_punctuation, as one string."""
        return "".join(dict(self.wrapping_punctuation).keys())

    @cached_property
    def trailing_punctuation_chars_no_semicolon(self):
        """trailing_punctuation_chars with the semicolon removed."""
        return self.trailing_punctuation_chars.replace(";", "")

    @cached_property
    def trailing_punctuation_chars_has_semicolon(self):
        """Whether ';' counts as trailing punctuation."""
        return ";" in self.trailing_punctuation_chars

    def trim_punctuation(self, word):
        """
        Trim trailing and wrapping punctuation from `word`. Return the items of
        the new state.
        """
        # Strip all opening wrapping punctuation.
        middle = word.lstrip(self.wrapping_punctuation_openings)
        lead = word[: len(word) - len(middle)]
        trail = ""

        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        counts = CountsDict(word=middle)
        while trimmed_something and middle:
            trimmed_something = False
            # Trim wrapping punctuation.
            for opening, closing in self.wrapping_punctuation:
                if counts[opening] < counts[closing]:
                    rstripped = middle.rstrip(closing)
                    if rstripped != middle:
                        strip = counts[closing] - counts[opening]
                        # Prepend to the existing trail: an earlier pass may
                        # already have moved trailing punctuation there (for
                        # example the "." in "http://example.com/a)."), and
                        # that text must not be discarded.
                        trail = middle[-strip:] + trail
                        middle = middle[:-strip]
                        trimmed_something = True
                        counts[closing] -= strip

            # A ";" may terminate an HTML entity, so when the word contains
            # an "&" semicolons are left for the entity check below.
            amp = middle.rfind("&")
            if amp == -1:
                rstripped = middle.rstrip(self.trailing_punctuation_chars)
            else:
                rstripped = middle.rstrip(self.trailing_punctuation_chars_no_semicolon)
            if rstripped != middle:
                trail = middle[len(rstripped) :] + trail
                middle = rstripped
                trimmed_something = True

            if self.trailing_punctuation_chars_has_semicolon and middle.endswith(";"):
                # Only strip if not part of an HTML entity.
                potential_entity = middle[amp:]
                escaped = html.unescape(potential_entity)
                if escaped == potential_entity or escaped.endswith(";"):
                    rstripped = middle.rstrip(self.trailing_punctuation_chars)
                    trail_start = len(rstripped)
                    amount_trailing_semicolons = len(middle) - len(middle.rstrip(";"))
                    if amp > -1 and amount_trailing_semicolons > 1:
                        # Leave up to most recent semicolon as might be an entity.
                        recent_semicolon = middle[trail_start:].index(";")
                        middle_semicolon_index = recent_semicolon + trail_start + 1
                        trail = middle[middle_semicolon_index:] + trail
                        middle = rstripped + middle[trail_start:middle_semicolon_index]
                    else:
                        trail = middle[trail_start:] + trail
                        middle = rstripped
                    trimmed_something = True

        return lead, middle, trail

    @staticmethod
    def is_email_simple(value):
        """Return True if value looks like an email address."""
        # An @ must be in the middle of the value.
        if "@" not in value or value.startswith("@") or value.endswith("@"):
            return False
        try:
            p1, p2 = value.split("@")
        except ValueError:
            # value contains more than one @.
            return False
        # Max length for domain name labels is 63 characters per RFC 1034.
        # Helps to avoid ReDoS vectors in the domain part.
        if len(p2) > 63:
            return False
        # Dot must be in p2 (e.g. example.com)
        if "." not in p2 or p2.startswith("."):
            return False
        return True
474
475
# Module-level Urlizer instance that urlize() delegates to.
urlizer = Urlizer()


@keep_lazy_text
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Convert URLs in `text` into links by calling the module-level `urlizer`
    with the same options.
    """
    options = {
        "trim_url_limit": trim_url_limit,
        "nofollow": nofollow,
        "autoescape": autoescape,
    }
    return urlizer(text, **options)
484
485
def avoid_wrapping(value):
    """
    Return `value` with every normal space replaced by a non-breaking space,
    so the text does not wrap in the middle of a phrase.
    """
    non_breaking_space = "\xa0"
    return value.replace(" ", non_breaking_space)
492
493
def html_safe(klass):
    """
    Class decorator that adds an __html__ method and makes __str__ return a
    string marked safe. This helps non-Django templates to detect classes
    whose __str__ methods return SafeString.

    Raise ValueError when the class itself already defines __html__() or
    does not define __str__().
    """
    # Only attributes defined directly on the class count, not inherited ones.
    own_attrs = klass.__dict__
    if "__html__" in own_attrs:
        message = "can't apply @html_safe to %s because it defines __html__()."
        raise ValueError(message % klass.__name__)
    if "__str__" not in own_attrs:
        message = "can't apply @html_safe to %s because it doesn't define __str__()."
        raise ValueError(message % klass.__name__)
    original_str = klass.__str__
    klass.__str__ = lambda self: mark_safe(original_str(self))
    klass.__html__ = lambda self: str(self)
    return klass