Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/django/utils/html.py: 63%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

251 statements  

1"""HTML utilities suitable for global use.""" 

2 

3import html 

4import json 

5import re 

6from collections.abc import Mapping 

7from html.parser import HTMLParser 

8from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit 

9 

10from django.core.exceptions import SuspiciousOperation 

11from django.utils.encoding import punycode 

12from django.utils.functional import Promise, cached_property, keep_lazy, keep_lazy_text 

13from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS 

14from django.utils.regex_helper import _lazy_re_compile 

15from django.utils.safestring import SafeData, SafeString, mark_safe 

16from django.utils.text import normalize_newlines 

17 

# Names of HTML elements that never take a closing tag.
# https://html.spec.whatwg.org/#void-elements
VOID_ELEMENTS = frozenset(
    "area base br col embed hr img input link meta param source track wbr".split()
    # Deprecated tags.
    + ["frame", "spacer"]
)

40 

# Words whose punctuation-trimmed core is longer than this are not matched
# against the URL patterns in Urlizer.handle_word().
MAX_URL_LENGTH = 2048
# Number of tag-removing _strip_once() passes strip_tags() allows before it
# raises SuspiciousOperation.
MAX_STRIP_TAGS_DEPTH = 50

43 

44 

@keep_lazy(SafeString)
def escape(text):
    """
    HTML-encode ``text`` (coerced with str()) and return it as a SafeString.

    The input is encoded unconditionally, so a value that was already
    escaped gets escaped a second time. Use conditional_escape() when
    pre-escaped input must be left alone.
    """
    encoded = html.escape(str(text))
    return SafeString(encoded)

56 

57 

58_js_escapes = { 

59 ord("\\"): "\\u005C", 

60 ord("'"): "\\u0027", 

61 ord('"'): "\\u0022", 

62 ord(">"): "\\u003E", 

63 ord("<"): "\\u003C", 

64 ord("&"): "\\u0026", 

65 ord("="): "\\u003D", 

66 ord("-"): "\\u002D", 

67 ord(";"): "\\u003B", 

68 ord("`"): "\\u0060", 

69 ord("\u2028"): "\\u2028", 

70 ord("\u2029"): "\\u2029", 

71} 

72 

73# Escape every ASCII character with a value less than 32. 

74_js_escapes.update((ord("%c" % z), "\\u%04X" % z) for z in range(32)) 

75 

76 

@keep_lazy(SafeString)
def escapejs(value):
    """
    Return str(value) with every character listed in _js_escapes replaced by
    its "\\uXXXX" escape, marked safe, for use in JavaScript strings.
    """
    escaped = str(value).translate(_js_escapes)
    return mark_safe(escaped)

81 

82 

83_json_script_escapes = { 

84 ord(">"): "\\u003E", 

85 ord("<"): "\\u003C", 

86 ord("&"): "\\u0026", 

87} 

88 

89 

def json_script(value, element_id=None, encoder=None):
    """
    Serialize ``value`` to JSON, replace ">", "<" and "&" with their unicode
    escapes, and wrap the result in a <script type="application/json"> tag.

    ``encoder`` is the JSON encoder class to use; DjangoJSONEncoder is used
    when it is not given. A truthy ``element_id`` is emitted as the tag's
    ``id`` attribute; otherwise the tag has no id.
    """
    from django.core.serializers.json import DjangoJSONEncoder

    encoder_class = encoder or DjangoJSONEncoder
    serialized = json.dumps(value, cls=encoder_class)
    payload = mark_safe(serialized.translate(_json_script_escapes))
    if not element_id:
        return format_html('<script type="application/json">{}</script>', payload)
    return format_html(
        '<script id="{}" type="application/json">{}</script>', element_id, payload
    )

108 

109 

def conditional_escape(text):
    """
    Escape ``text`` like escape() does, unless it declares itself as already
    escaped.

    An object is treated as pre-escaped when it has an ``__html__``
    attribute; the result of calling it is returned as-is. Promise (lazy)
    values are converted with str() before that check.
    """
    if isinstance(text, Promise):
        text = str(text)
    if not hasattr(text, "__html__"):
        return escape(text)
    return text.__html__()

123 

124 

def format_html(format_string, *args, **kwargs):
    """
    Build an HTML fragment with str.format semantics.

    Every positional and keyword argument goes through conditional_escape()
    before interpolation, and the formatted result is passed to mark_safe().
    Raise TypeError when called without any argument to interpolate.
    """
    if not args and not kwargs:
        raise TypeError("args or kwargs must be provided.")
    escaped_kwargs = {name: conditional_escape(val) for name, val in kwargs.items()}
    escaped_args = tuple(conditional_escape(arg) for arg in args)
    return mark_safe(format_string.format(*escaped_args, **escaped_kwargs))

136 

137 

def format_html_join(sep, format_string, args_generator):
    """
    Apply format_html() with the same ``format_string`` to every item of
    ``args_generator`` and join the fragments with ``sep``.

    ``sep`` goes through conditional_escape() first. Each item yielded by
    ``args_generator`` supplies the arguments for one format_html() call: a
    Mapping is passed as keyword arguments, anything else is unpacked as
    positional arguments.

    Example:

      format_html_join('\n', "<li>{} {}</li>", ((u.first_name, u.last_name)
                                                  for u in users))
    """

    def render(item):
        if isinstance(item, Mapping):
            return format_html(format_string, **item)
        return format_html(format_string, *item)

    joiner = conditional_escape(sep)
    return mark_safe(joiner.join(render(item) for item in args_generator))

162 

163 

@keep_lazy_text
def linebreaks(value, autoescape=False):
    """
    Wrap each paragraph of ``value`` in <p>...</p> and turn the remaining
    single newlines into <br>.

    Paragraphs are separated by runs of two or more newlines (after newline
    normalization). With ``autoescape`` each paragraph is passed through
    escape() before the <br> substitution.
    """
    text = str(normalize_newlines(value))
    blocks = []
    for chunk in re.split("\n{2,}", text):
        body = escape(chunk) if autoescape else chunk
        blocks.append("<p>%s</p>" % body.replace("\n", "<br>"))
    return "\n\n".join(blocks)

174 

175 

class MLStripper(HTMLParser):
    """
    HTMLParser subclass that records the non-tag content it is fed.

    Character data is kept verbatim, and entity and character references are
    stored in their original "&name;" / "&#num;" form (the parser is created
    with convert_charrefs=False so they reach the handlers below).
    """

    def __init__(self):
        super().__init__(convert_charrefs=False)
        # Collected text fragments, in document order.
        self.fed = []
        self.reset()

    def handle_data(self, d):
        self.fed.append(d)

    def handle_entityref(self, name):
        self.fed.append(f"&{name};")

    def handle_charref(self, name):
        self.fed.append(f"&#{name};")

    def get_data(self):
        """Return everything collected so far as a single string."""
        return "".join(self.fed)

193 

194 

def _strip_once(value):
    """
    Run a single MLStripper pass over ``value`` and return the text it
    collected. Internal helper for strip_tags().
    """
    stripper = MLStripper()
    stripper.feed(value)
    stripper.close()
    return stripper.get_data()

203 

204 

@keep_lazy_text
def strip_tags(value):
    """
    Return str(value) with all HTML tags stripped.

    _strip_once() is applied repeatedly while the text still contains both
    "<" and ">" and the previous pass removed at least one "<". Raise
    SuspiciousOperation if MAX_STRIP_TAGS_DEPTH passes have each removed
    something and the text still looks like it contains tags.
    """
    text = str(value)
    passes = 0
    # Note: in the typical case _strip_once runs twice (the second run does
    # not remove any more tags).
    while True:
        if "<" not in text or ">" not in text:
            break
        if passes >= MAX_STRIP_TAGS_DEPTH:
            raise SuspiciousOperation
        stripped = _strip_once(text)
        if stripped.count("<") == text.count("<"):
            # _strip_once wasn't able to detect more tags.
            break
        text = stripped
        passes += 1
    return text

222 

223 

@keep_lazy_text
def strip_spaces_between_tags(value):
    """
    Return str(value) with any run of whitespace that sits between a ">"
    and the following "<" removed.
    """
    markup = str(value)
    return re.sub(r">\s+<", "><", markup)

228 

229 

def smart_urlquote(url):
    """
    Quote ``url`` if it isn't already quoted.

    Each component is unquoted first and then quoted again, so an already
    quoted URL is not double-quoted. The host part is converted with
    punycode(). If the URL cannot be split or the host cannot be converted,
    the whole string is unquoted and re-quoted as a single segment.
    """
    # Tilde is part of RFC 3986 Section 2.3 Unreserved Characters,
    # see also https://bugs.python.org/issue16285
    keep_chars = RFC3986_SUBDELIMS + RFC3986_GENDELIMS + "~"

    def requote(part):
        return quote(unquote(part), safe=keep_chars)

    # Handle IDN before quoting.
    try:
        scheme, netloc, path, query, fragment = urlsplit(url)
    except ValueError:
        # invalid IPv6 URL (normally square brackets in hostname part).
        return requote(url)

    try:
        netloc = punycode(netloc)  # IDN -> ACE
    except UnicodeError:  # invalid domain part
        return requote(url)

    if query:
        # Keys and values are unquoted separately so that separators
        # embedded in a value are not mistaken for querystring separators.
        # See #22267.
        pairs = [
            (unquote(key), unquote(val))
            for key, val in parse_qsl(query, keep_blank_values=True)
        ]
        # urlencode will take care of quoting
        query = urlencode(pairs)

    return urlunsplit((scheme, netloc, requote(path), query, requote(fragment)))

265 

266 

class CountsDict(dict):
    """
    Dict that lazily caches how often a substring occurs in ``word``.

    Looking up a key that is not stored yet computes ``word.count(key)``,
    stores it under that key, and returns it. Stored values can be changed
    afterwards like in any dict, which lets callers keep the counts in step
    with their own edits.

    Positional and keyword arguments other than ``word`` are forwarded to
    dict() as initial items.
    """

    def __init__(self, *args, word, **kwargs):
        # Keyword items must be forwarded with ``**``; unpacking them with a
        # single ``*`` would pass the keyword names as positional arguments.
        super().__init__(*args, **kwargs)
        self.word = word

    def __missing__(self, key):
        count = self[key] = self.word.count(key)
        return count

275 

276 

class Urlizer:
    """
    Convert any URLs in text into clickable links.

    Work on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
    """

    # Characters trimmed from the end of a word before it is tested as a URL.
    trailing_punctuation_chars = ".,:;!"
    # (opening, closing) pairs that may wrap a URL.
    wrapping_punctuation = [("(", ")"), ("[", "]")]

    simple_url_re = _lazy_re_compile(r"^https?://\[?\w", re.IGNORECASE)
    simple_url_2_re = _lazy_re_compile(
        r"^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$", re.IGNORECASE
    )
    word_split_re = _lazy_re_compile(r"""([\s<>"']+)""")

    mailto_template = "mailto:{local}@{domain}"
    url_template = '<a href="{href}"{attrs}>{url}</a>'

    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
        """
        Return ``text`` with the URLs and email addresses it contains turned
        into links.

        If trim_url_limit is not None, truncate the URLs in the link text
        longer than this limit to trim_url_limit - 1 characters and append an
        ellipsis.

        If nofollow is True, give the links a rel="nofollow" attribute.

        If autoescape is True, autoescape the link text and URLs.
        """
        safe_input = isinstance(text, SafeData)

        words = self.word_split_re.split(str(text))
        # Identical words give identical output within one call, so each
        # distinct word is processed only once.
        local_cache = {}
        urlized_words = []
        for word in words:
            if (urlized_word := local_cache.get(word)) is None:
                urlized_word = self.handle_word(
                    word,
                    safe_input=safe_input,
                    trim_url_limit=trim_url_limit,
                    nofollow=nofollow,
                    autoescape=autoescape,
                )
                local_cache[word] = urlized_word
            urlized_words.append(urlized_word)
        return "".join(urlized_words)

    def handle_word(
        self,
        word,
        *,
        safe_input,
        trim_url_limit=None,
        nofollow=False,
        autoescape=False,
    ):
        """
        Return the replacement for a single ``word`` of the input.

        A word recognized as a URL or email address becomes a link built
        from url_template, with its trimmed leading/trailing punctuation
        kept outside the link. Any other word is returned marked safe when
        ``safe_input`` is true, escaped when ``autoescape`` is true, and
        unchanged otherwise.
        """
        if "." in word or "@" in word or ":" in word:
            # lead: Punctuation trimmed from the beginning of the word.
            # middle: State of the word.
            # trail: Punctuation trimmed from the end of the word.
            lead, middle, trail = self.trim_punctuation(word)
            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ""
            short_enough = len(middle) <= MAX_URL_LENGTH
            if short_enough and self.simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif short_enough and self.simple_url_2_re.match(middle):
                url = smart_urlquote("http://%s" % html.unescape(middle))
            elif ":" not in middle and self.is_email_simple(middle):
                local, domain = middle.rsplit("@", 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    return word
                local = quote(local, safe="")
                domain = quote(domain, safe="")
                url = self.mailto_template.format(local=local, domain=domain)
                # mailto: links never get rel="nofollow".
                nofollow_attr = ""
            # Make link.
            if url:
                trimmed = self.trim_url(middle, limit=trim_url_limit)
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    trimmed = escape(trimmed)
                middle = self.url_template.format(
                    href=escape(url),
                    attrs=nofollow_attr,
                    url=trimmed,
                )
                return mark_safe(f"{lead}{middle}{trail}")
        # Not turned into a link.
        if safe_input:
            return mark_safe(word)
        if autoescape:
            return escape(word)
        return word

    def trim_url(self, x, *, limit):
        """
        Return ``x`` shortened for display: unchanged when ``limit`` is None
        or ``x`` is at most ``limit`` characters long, otherwise its first
        ``limit - 1`` characters (never fewer than zero) followed by "…".
        """
        if limit is None or len(x) <= limit:
            return x
        return "%s…" % x[: max(0, limit - 1)]

    @cached_property
    def wrapping_punctuation_openings(self):
        """All opening characters of wrapping_punctuation, as one string."""
        return "".join(dict(self.wrapping_punctuation).keys())

    @cached_property
    def trailing_punctuation_chars_no_semicolon(self):
        """trailing_punctuation_chars with any ";" removed."""
        return self.trailing_punctuation_chars.replace(";", "")

    @cached_property
    def trailing_punctuation_chars_has_semicolon(self):
        """Whether ";" is one of trailing_punctuation_chars."""
        return ";" in self.trailing_punctuation_chars

    def trim_punctuation(self, word):
        """
        Trim trailing and wrapping punctuation from `word`. Return the items of
        the new state as a ``(lead, middle, trail)`` tuple whose concatenation
        is `word`.
        """
        # Strip all opening wrapping punctuation.
        middle = word.lstrip(self.wrapping_punctuation_openings)
        lead = word[: len(word) - len(middle)]
        trail = ""

        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        # Occurrence counts of wrapping characters in middle, computed on
        # first use and adjusted below as closers are moved to trail.
        counts = CountsDict(word=middle)
        while trimmed_something and middle:
            trimmed_something = False
            # Trim wrapping punctuation.
            for opening, closing in self.wrapping_punctuation:
                if counts[opening] < counts[closing]:
                    rstripped = middle.rstrip(closing)
                    if rstripped != middle:
                        # Strip the unbalanced closers, but never more than
                        # the closers actually present at the end, so that
                        # no other character is moved out of middle.
                        strip = min(
                            counts[closing] - counts[opening],
                            len(middle) - len(rstripped),
                        )
                        # Prepend to trail: an earlier pass may already have
                        # moved punctuation there.
                        trail = middle[-strip:] + trail
                        middle = middle[:-strip]
                        trimmed_something = True
                        counts[closing] -= strip

            amp = middle.rfind("&")
            if amp == -1:
                rstripped = middle.rstrip(self.trailing_punctuation_chars)
            else:
                # A trailing ";" may terminate an HTML entity; it is handled
                # separately below.
                rstripped = middle.rstrip(self.trailing_punctuation_chars_no_semicolon)
            if rstripped != middle:
                trail = middle[len(rstripped) :] + trail
                middle = rstripped
                trimmed_something = True

            if self.trailing_punctuation_chars_has_semicolon and middle.endswith(";"):
                # Only strip if not part of an HTML entity.
                potential_entity = middle[amp:]
                escaped = html.unescape(potential_entity)
                if escaped == potential_entity or escaped.endswith(";"):
                    rstripped = middle.rstrip(self.trailing_punctuation_chars)
                    trail_start = len(rstripped)
                    amount_trailing_semicolons = len(middle) - len(middle.rstrip(";"))
                    if amp > -1 and amount_trailing_semicolons > 1:
                        # Leave up to most recent semicolon as might be an entity.
                        recent_semicolon = middle[trail_start:].index(";")
                        middle_semicolon_index = recent_semicolon + trail_start + 1
                        trail = middle[middle_semicolon_index:] + trail
                        middle = rstripped + middle[trail_start:middle_semicolon_index]
                    else:
                        trail = middle[trail_start:] + trail
                        middle = rstripped
                    trimmed_something = True

        return lead, middle, trail

    @staticmethod
    def is_email_simple(value):
        """Return True if value looks like an email address."""
        # An @ must be in the middle of the value.
        if "@" not in value or value.startswith("@") or value.endswith("@"):
            return False
        try:
            p1, p2 = value.split("@")
        except ValueError:
            # value contains more than one @.
            return False
        # Max length for domain name labels is 63 characters per RFC 1034.
        # Helps to avoid ReDoS vectors in the domain part.
        if len(p2) > 63:
            return False
        # Dot must be in p2 (e.g. example.com)
        if "." not in p2 or p2.startswith("."):
            return False
        return True

474 

475 

# Module-level Urlizer instance that urlize() delegates to.
urlizer = Urlizer()


@keep_lazy_text
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Call the module-level ``urlizer`` on ``text``, forwarding
    ``trim_url_limit``, ``nofollow`` and ``autoescape`` unchanged, and return
    its result.
    """
    options = {
        "trim_url_limit": trim_url_limit,
        "nofollow": nofollow,
        "autoescape": autoescape,
    }
    return urlizer(text, **options)

484 

485 

def avoid_wrapping(value):
    """
    Return ``value`` with every normal space replaced by a non-breaking
    space, so that the text is not wrapped in the middle of a phrase.
    """
    non_breaking_space = "\xa0"
    return value.replace(" ", non_breaking_space)

492 

493 

def html_safe(klass):
    """
    Class decorator that adds an __html__() method and makes __str__()
    return its result wrapped with mark_safe(). This helps non-Django
    templates to detect classes whose __str__ methods return SafeString.

    Raise ValueError if the class itself already defines __html__() or does
    not define its own __str__().
    """
    own_attrs = vars(klass)
    if "__html__" in own_attrs:
        raise ValueError(
            "can't apply @html_safe to %s because it defines __html__()."
            % klass.__name__
        )
    if "__str__" not in own_attrs:
        raise ValueError(
            "can't apply @html_safe to %s because it doesn't define __str__()."
            % klass.__name__
        )
    original_str = klass.__str__
    klass.__str__ = lambda self: mark_safe(original_str(self))
    klass.__html__ = lambda self: str(self)
    return klass