Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/django/utils/html.py: 63%

1"""HTML utilities suitable for global use."""

3import html

4import json

5import re

6from collections.abc import Mapping

7from html.parser import HTMLParser

8from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit

10from django.core.exceptions import SuspiciousOperation

11from django.utils.encoding import punycode

12from django.utils.functional import Promise, cached_property, keep_lazy, keep_lazy_text

13from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS

14from django.utils.regex_helper import _lazy_re_compile

15from django.utils.safestring import SafeData, SafeString, mark_safe

16from django.utils.text import normalize_newlines

# Names of elements that are written without a closing tag.
# https://html.spec.whatwg.org/#void-elements
VOID_ELEMENTS = frozenset(
    (
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        # Deprecated tags.
        "frame",
        "spacer",
    )
)

# Longest candidate URL (in characters) that Urlizer.handle_word() will try
# to match against its URL patterns.
MAX_URL_LENGTH = 2048
# Maximum number of stripping passes strip_tags() performs before giving up.
MAX_STRIP_TAGS_DEPTH = 50

@keep_lazy(SafeString)
def escape(text):
    """
    Return the given text with ampersands, quotes and angle brackets encoded
    for use in HTML.

    Always escape input, even if it's already escaped and marked as such.
    This may result in double-escaping. If this is a concern, use
    conditional_escape() instead.

    The input is converted with str() first and the result is wrapped in a
    SafeString.
    """
    return SafeString(html.escape(str(text)))

58_js_escapes = {

59 ord("\\"): "\\u005C",

60 ord("'"): "\\u0027",

61 ord('"'): "\\u0022",

62 ord(">"): "\\u003E",

63 ord("<"): "\\u003C",

64 ord("&"): "\\u0026",

65 ord("="): "\\u003D",

66 ord("-"): "\\u002D",

67 ord(";"): "\\u003B",

68 ord("`"): "\\u0060",

69 ord("\u2028"): "\\u2028",

70 ord("\u2029"): "\\u2029",

71}

73# Escape every ASCII character with a value less than 32.

74_js_escapes.update((ord("%c" % z), "\\u%04X" % z) for z in range(32))

@keep_lazy(SafeString)
def escapejs(value):
    """
    Hex encode characters for use in JavaScript strings.

    The value is converted with str(), translated through the _js_escapes
    table and returned via mark_safe().
    """
    return mark_safe(str(value).translate(_js_escapes))

83_json_script_escapes = {

84 ord(">"): "\\u003E",

85 ord("<"): "\\u003C",

86 ord("&"): "\\u0026",

87}

def json_script(value, element_id=None, encoder=None):
    """
    Escape all the HTML/XML special characters with their unicode escapes, so
    value is safe to be output anywhere except for inside a tag attribute. Wrap
    the escaped JSON in a script tag.

    If element_id is truthy, it is emitted as the script tag's id attribute.
    encoder is the json.dumps() encoder class; DjangoJSONEncoder is used when
    it is not given.
    """
    from django.core.serializers.json import DjangoJSONEncoder

    json_str = json.dumps(value, cls=encoder or DjangoJSONEncoder).translate(
        _json_script_escapes
    )
    if element_id:
        template = '<script id="{}" type="application/json">{}</script>'
        args = (element_id, mark_safe(json_str))
    else:
        template = '<script type="application/json">{}</script>'
        args = (mark_safe(json_str),)
    return format_html(template, *args)

108

109

def conditional_escape(text):
    """
    Similar to escape(), except that it doesn't operate on pre-escaped strings.

    This function relies on the __html__ convention used both by Django's
    SafeData class and by third-party libraries like markupsafe.
    """
    # Resolve lazy objects first so the __html__ check sees the real value.
    if isinstance(text, Promise):
        text = str(text)
    if hasattr(text, "__html__"):
        return text.__html__()
    else:
        return escape(text)

123

124

def format_html(format_string, *args, **kwargs):
    """
    Similar to str.format, but pass all arguments through conditional_escape(),
    and call mark_safe() on the result. This function should be used instead
    of str.format or % interpolation to build up small HTML fragments.

    Raise TypeError if neither positional nor keyword arguments are given.
    """
    if not (args or kwargs):
        raise TypeError("args or kwargs must be provided.")
    args_safe = map(conditional_escape, args)
    kwargs_safe = {k: conditional_escape(v) for (k, v) in kwargs.items()}
    return mark_safe(format_string.format(*args_safe, **kwargs_safe))

136

137

def format_html_join(sep, format_string, args_generator):
    """
    A wrapper of format_html, for the common case of a group of arguments that
    need to be formatted using the same format string, and then joined using
    'sep'. 'sep' is also passed through conditional_escape.

    'args_generator' should be an iterator that returns the sequence of 'args'
    that will be passed to format_html. An item that is a Mapping is passed to
    format_html as keyword arguments; any other item is unpacked as positional
    arguments.

    Example:

      format_html_join('\n', "<li>{} {}</li>", ((u.first_name, u.last_name)
                                                  for u in users))
    """
    return mark_safe(
        conditional_escape(sep).join(
            (
                format_html(format_string, **args)
                if isinstance(args, Mapping)
                else format_html(format_string, *args)
            )
            for args in args_generator
        )
    )

162

163

@keep_lazy_text
def linebreaks(value, autoescape=False):
    """
    Convert newlines into <p> and <br>s.

    Runs of two or more newlines separate paragraphs; each remaining single
    newline becomes a <br>. If autoescape is True, each paragraph is passed
    through escape() before the <br> substitution.
    """
    value = normalize_newlines(value)
    paras = re.split("\n{2,}", str(value))
    if autoescape:
        paras = ["<p>%s</p>" % escape(p).replace("\n", "<br>") for p in paras]
    else:
        paras = ["<p>%s</p>" % p.replace("\n", "<br>") for p in paras]
    return "\n\n".join(paras)

174

175

class MLStripper(HTMLParser):
    """
    HTMLParser subclass that collects text content and discards tags.

    Character and entity references are re-emitted in their original
    "&name;" / "&#num;" form rather than being decoded.
    """

    def __init__(self):
        # convert_charrefs=False routes references to handle_entityref() and
        # handle_charref() instead of decoding them into the text data.
        super().__init__(convert_charrefs=False)
        self.reset()
        # Collected output fragments, in document order.
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def handle_entityref(self, name):
        self.fed.append("&%s;" % name)

    def handle_charref(self, name):
        self.fed.append("&#%s;" % name)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return "".join(self.fed)

193

194

def _strip_once(value):
    """
    Internal tag stripping utility used by strip_tags.

    Feed value through a fresh MLStripper once and return the collected text.
    """
    s = MLStripper()
    s.feed(value)
    s.close()
    return s.get_data()

203

204

@keep_lazy_text
def strip_tags(value):
    """
    Return the given HTML with all tags stripped.

    Raise SuspiciousOperation if MAX_STRIP_TAGS_DEPTH stripping passes have
    been applied and the value still contains both "<" and ">".
    """
    value = str(value)
    # Note: in typical case this loop executes _strip_once twice (the second
    # execution does not remove any more tags).
    strip_tags_depth = 0
    while "<" in value and ">" in value:
        if strip_tags_depth >= MAX_STRIP_TAGS_DEPTH:
            raise SuspiciousOperation
        new_value = _strip_once(value)
        if value.count("<") == new_value.count("<"):
            # _strip_once wasn't able to detect more tags.
            break
        value = new_value
        strip_tags_depth += 1
    return value

222

223

@keep_lazy_text
def strip_spaces_between_tags(value):
    """
    Return the given HTML with spaces between tags removed.

    Any run of whitespace between a ">" and the next "<" is dropped.
    """
    return re.sub(r">\s+<", "><", str(value))

228

229

def smart_urlquote(url):
    """
    Quote a URL if it isn't already quoted.

    The URL is split into components; the host is converted with punycode(),
    the query string is re-encoded pair by pair, and the path and fragment are
    unquoted and quoted again. If the URL cannot be split or the host cannot
    be converted, the whole URL is unquoted and re-quoted as one segment.
    """

    def unquote_quote(segment):
        segment = unquote(segment)
        # Tilde is part of RFC 3986 Section 2.3 Unreserved Characters,
        # see also https://bugs.python.org/issue16285
        return quote(segment, safe=RFC3986_SUBDELIMS + RFC3986_GENDELIMS + "~")

    # Handle IDN before quoting.
    try:
        scheme, netloc, path, query, fragment = urlsplit(url)
    except ValueError:
        # invalid IPv6 URL (normally square brackets in hostname part).
        return unquote_quote(url)

    try:
        netloc = punycode(netloc)  # IDN -> ACE
    except UnicodeError:  # invalid domain part
        return unquote_quote(url)

    if query:
        # Separately unquoting key/value, so as to not mix querystring separators
        # included in query values. See #22267.
        query_parts = [
            (unquote(q[0]), unquote(q[1]))
            for q in parse_qsl(query, keep_blank_values=True)
        ]
        # urlencode will take care of quoting
        query = urlencode(query_parts)

    path = unquote_quote(path)
    fragment = unquote_quote(fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))

265

266

class CountsDict(dict):
    """
    Dict that lazily counts occurrences of substrings in a fixed word.

    Looking up a key that is not present stores ``word.count(key)`` under
    that key and returns it; later lookups reuse (and may modify) the stored
    value.
    """

    def __init__(self, *args, word, **kwargs):
        # Forward keyword arguments as keywords. Unpacking them with a single
        # star would hand the keyword *names* to dict() as extra positional
        # arguments, which dict() rejects.
        super().__init__(*args, **kwargs)
        self.word = word

    def __missing__(self, key):
        self[key] = self.word.count(key)
        return self[key]

275

276

class Urlizer:
    """
    Convert any URLs in text into clickable links.

    Work on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
    """

    trailing_punctuation_chars = ".,:;!"
    wrapping_punctuation = [("(", ")"), ("[", "]")]

    simple_url_re = _lazy_re_compile(r"^https?://\[?\w", re.IGNORECASE)
    simple_url_2_re = _lazy_re_compile(
        r"^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$", re.IGNORECASE
    )
    word_split_re = _lazy_re_compile(r"""([\s<>"']+)""")

    mailto_template = "mailto:{local}@{domain}"
    url_template = '<a href="{href}"{attrs}>{url}</a>'

    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
        """
        If trim_url_limit is not None, truncate the URLs in the link text
        longer than this limit to trim_url_limit - 1 characters and append an
        ellipsis.

        If nofollow is True, give the links a rel="nofollow" attribute.

        If autoescape is True, autoescape the link text and URLs.
        """
        safe_input = isinstance(text, SafeData)

        words = self.word_split_re.split(str(text))
        # Identical words produce identical output, so each distinct word is
        # processed only once per call.
        local_cache = {}
        urlized_words = []
        for word in words:
            if (urlized_word := local_cache.get(word)) is None:
                urlized_word = self.handle_word(
                    word,
                    safe_input=safe_input,
                    trim_url_limit=trim_url_limit,
                    nofollow=nofollow,
                    autoescape=autoescape,
                )
                local_cache[word] = urlized_word
            urlized_words.append(urlized_word)
        return "".join(urlized_words)

    def handle_word(
        self,
        word,
        *,
        safe_input,
        trim_url_limit=None,
        nofollow=False,
        autoescape=False,
    ):
        """
        Return the replacement text for a single word.

        A word recognised as a URL or simple email address is wrapped in an
        anchor tag. Any other word is returned via mark_safe() when the input
        was safe, via escape() when autoescape is on, and unchanged otherwise.
        """
        if "." in word or "@" in word or ":" in word:
            # lead: Punctuation trimmed from the beginning of the word.
            # middle: State of the word.
            # trail: Punctuation trimmed from the end of the word.
            lead, middle, trail = self.trim_punctuation(word)
            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ""
            if len(middle) <= MAX_URL_LENGTH and self.simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif len(middle) <= MAX_URL_LENGTH and self.simple_url_2_re.match(middle):
                url = smart_urlquote("http://%s" % html.unescape(middle))
            elif ":" not in middle and self.is_email_simple(middle):
                local, domain = middle.rsplit("@", 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    return word
                local = quote(local, safe="")
                domain = quote(domain, safe="")
                url = self.mailto_template.format(local=local, domain=domain)
                # mailto: links never get rel="nofollow".
                nofollow_attr = ""
            # Make link.
            if url:
                trimmed = self.trim_url(middle, limit=trim_url_limit)
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    trimmed = escape(trimmed)
                middle = self.url_template.format(
                    href=escape(url),
                    attrs=nofollow_attr,
                    url=trimmed,
                )
                return mark_safe(f"{lead}{middle}{trail}")
            else:
                if safe_input:
                    return mark_safe(word)
                elif autoescape:
                    return escape(word)
        elif safe_input:
            return mark_safe(word)
        elif autoescape:
            return escape(word)
        return word

    def trim_url(self, x, *, limit):
        """
        Return x unchanged if limit is None or x fits within limit; otherwise
        return its first ``limit - 1`` characters (never fewer than zero)
        followed by an ellipsis.
        """
        if limit is None or len(x) <= limit:
            return x
        return "%s…" % x[: max(0, limit - 1)]

    @cached_property
    def wrapping_punctuation_openings(self):
        """All opening characters of wrapping_punctuation, as one string."""
        return "".join(dict(self.wrapping_punctuation).keys())

    @cached_property
    def trailing_punctuation_chars_no_semicolon(self):
        """trailing_punctuation_chars with every ";" removed."""
        return self.trailing_punctuation_chars.replace(";", "")

    @cached_property
    def trailing_punctuation_chars_has_semicolon(self):
        """Whether ";" is one of the trailing_punctuation_chars."""
        return ";" in self.trailing_punctuation_chars

    def trim_punctuation(self, word):
        """
        Trim trailing and wrapping punctuation from `word`. Return the items of
        the new state.

        The result is a ``(lead, middle, trail)`` tuple: the opening wrapping
        punctuation stripped from the start, the remaining word, and the
        punctuation stripped from the end.
        """
        # Strip all opening wrapping punctuation.
        middle = word.lstrip(self.wrapping_punctuation_openings)
        lead = word[: len(word) - len(middle)]
        trail = ""

        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        counts = CountsDict(word=middle)
        while trimmed_something and middle:
            trimmed_something = False
            # Trim wrapping punctuation.
            for opening, closing in self.wrapping_punctuation:
                if counts[opening] < counts[closing]:
                    rstripped = middle.rstrip(closing)
                    if rstripped != middle:
                        strip = counts[closing] - counts[opening]
                        # Prepend to the existing trail: earlier passes (or an
                        # earlier punctuation pair) may already have moved
                        # characters there, and they must not be discarded.
                        trail = middle[-strip:] + trail
                        middle = middle[:-strip]
                        trimmed_something = True
                        counts[closing] -= strip

            # When the word contains "&", a trailing ";" may terminate an HTML
            # entity, so semicolons are handled separately below.
            amp = middle.rfind("&")
            if amp == -1:
                rstripped = middle.rstrip(self.trailing_punctuation_chars)
            else:
                rstripped = middle.rstrip(self.trailing_punctuation_chars_no_semicolon)
            if rstripped != middle:
                trail = middle[len(rstripped) :] + trail
                middle = rstripped
                trimmed_something = True

            if self.trailing_punctuation_chars_has_semicolon and middle.endswith(";"):
                # Only strip if not part of an HTML entity.
                potential_entity = middle[amp:]
                escaped = html.unescape(potential_entity)
                if escaped == potential_entity or escaped.endswith(";"):
                    rstripped = middle.rstrip(self.trailing_punctuation_chars)
                    trail_start = len(rstripped)
                    amount_trailing_semicolons = len(middle) - len(middle.rstrip(";"))
                    if amp > -1 and amount_trailing_semicolons > 1:
                        # Leave up to most recent semicolon as might be an entity.
                        recent_semicolon = middle[trail_start:].index(";")
                        middle_semicolon_index = recent_semicolon + trail_start + 1
                        trail = middle[middle_semicolon_index:] + trail
                        middle = rstripped + middle[trail_start:middle_semicolon_index]
                    else:
                        trail = middle[trail_start:] + trail
                        middle = rstripped
                    trimmed_something = True

        return lead, middle, trail

    @staticmethod
    def is_email_simple(value):
        """Return True if value looks like an email address."""
        # An @ must be in the middle of the value.
        if "@" not in value or value.startswith("@") or value.endswith("@"):
            return False
        try:
            p1, p2 = value.split("@")
        except ValueError:
            # value contains more than one @.
            return False
        # Max length for domain name labels is 63 characters per RFC 1034.
        # Helps to avoid ReDoS vectors in the domain part.
        # NOTE(review): the check below bounds the whole domain part (p2), not
        # each dot-separated label.
        if len(p2) > 63:
            return False
        # Dot must be in p2 (e.g. example.com)
        if "." not in p2 or p2.startswith("."):
            return False
        return True

474

475

# Shared module-level instance used by urlize().
urlizer = Urlizer()

477

478

@keep_lazy_text
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Convert URLs in text into clickable links.

    Delegate to the module-level Urlizer instance, forwarding trim_url_limit,
    nofollow and autoescape unchanged.
    """
    return urlizer(
        text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape
    )

484

485

def avoid_wrapping(value):
    """
    Avoid text wrapping in the middle of a phrase by adding non-breaking
    spaces where there previously were normal spaces.
    """
    return value.replace(" ", "\xa0")

492

493

def html_safe(klass):
    """
    A decorator that defines the __html__ method. This helps non-Django
    templates to detect classes whose __str__ methods return SafeString.

    The class's own __str__ is wrapped so its result goes through mark_safe(),
    and __html__ is defined to return str(self).

    Raise ValueError if the class itself already defines __html__ or does not
    define __str__.
    """
    if "__html__" in klass.__dict__:
        raise ValueError(
            "can't apply @html_safe to %s because it defines "
            "__html__()." % klass.__name__
        )
    if "__str__" not in klass.__dict__:
        raise ValueError(
            "can't apply @html_safe to %s because it doesn't "
            "define __str__()." % klass.__name__
        )
    # Capture the original __str__ before replacing it, so the wrapper does
    # not call itself.
    klass_str = klass.__str__
    klass.__str__ = lambda self: mark_safe(klass_str(self))
    klass.__html__ = lambda self: str(self)
    return klass