Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/django/utils/html.py: 31%

1"""HTML utilities suitable for global use."""

3import html

4import json

5import re

6from html.parser import HTMLParser

7from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit

9from django.utils.encoding import punycode

10from django.utils.functional import Promise, keep_lazy, keep_lazy_text

11from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS

12from django.utils.regex_helper import _lazy_re_compile

13from django.utils.safestring import SafeData, SafeString, mark_safe

14from django.utils.text import normalize_newlines

@keep_lazy(SafeString)
def escape(text):
    """
    Encode ampersands, quotes and angle brackets in the given text so that it
    can be used in HTML, and return the result as a SafeString.

    The input is escaped unconditionally, even when it has already been
    escaped and marked as such, so double-escaping is possible. Use
    conditional_escape() when that matters.
    """
    as_text = str(text)
    encoded = html.escape(as_text)
    return SafeString(encoded)

30_js_escapes = {

31 ord("\\"): "\\u005C",

32 ord("'"): "\\u0027",

33 ord('"'): "\\u0022",

34 ord(">"): "\\u003E",

35 ord("<"): "\\u003C",

36 ord("&"): "\\u0026",

37 ord("="): "\\u003D",

38 ord("-"): "\\u002D",

39 ord(";"): "\\u003B",

40 ord("`"): "\\u0060",

41 ord("\u2028"): "\\u2028",

42 ord("\u2029"): "\\u2029",

43}

45# Escape every ASCII character with a value less than 32.

46_js_escapes.update((ord("%c" % z), "\\u%04X" % z) for z in range(32))

@keep_lazy(SafeString)
def escapejs(value):
    """
    Replace every character listed in _js_escapes with its unicode escape so
    the text can be placed in a JavaScript string, and mark the result safe.
    """
    as_text = str(value)
    translated = as_text.translate(_js_escapes)
    return mark_safe(translated)

55_json_script_escapes = {

56 ord(">"): "\\u003E",

57 ord("<"): "\\u003C",

58 ord("&"): "\\u0026",

59}

def json_script(value, element_id=None, encoder=None):
    """
    Serialize value to JSON, replace the HTML/XML special characters with
    their unicode escapes, and wrap the result in a script tag of type
    application/json. The output is safe anywhere except inside a tag
    attribute.

    The script tag gets an id attribute only when element_id is truthy.
    encoder, when given, replaces DjangoJSONEncoder as the JSON encoder class.
    """
    from django.core.serializers.json import DjangoJSONEncoder

    encoder_cls = encoder or DjangoJSONEncoder
    serialized = json.dumps(value, cls=encoder_cls)
    # The escaped JSON is marked safe so format_html() doesn't escape it again.
    payload = mark_safe(serialized.translate(_json_script_escapes))
    if not element_id:
        return format_html('<script type="application/json">{}</script>', payload)
    return format_html(
        '<script id="{}" type="application/json">{}</script>',
        element_id,
        payload,
    )

def conditional_escape(text):
    """
    Escape text like escape() does, unless it is already marked as escaped.

    Objects exposing an __html__() method (the convention followed by
    Django's SafeData class and by third-party libraries like markupsafe) are
    returned through that method instead of being escaped again.
    """
    # Lazy objects are forced to a string first so the check sees the real value.
    candidate = str(text) if isinstance(text, Promise) else text
    if not hasattr(candidate, "__html__"):
        return escape(candidate)
    return candidate.__html__()

def format_html(format_string, *args, **kwargs):
    """
    Work like str.format, except that every positional and keyword argument
    goes through conditional_escape() first and the formatted result is passed
    to mark_safe(). Use this rather than str.format or % interpolation when
    building small HTML fragments.
    """
    # Kept lazy on purpose: positional arguments are escaped while being
    # unpacked into format(), after the keyword arguments.
    escaped_args = (conditional_escape(arg) for arg in args)
    escaped_kwargs = {}
    for name, raw in kwargs.items():
        escaped_kwargs[name] = conditional_escape(raw)
    formatted = format_string.format(*escaped_args, **escaped_kwargs)
    return mark_safe(formatted)

106

107

def format_html_join(sep, format_string, args_generator):
    """
    Apply format_html() with the same format string to each group of
    arguments and join the resulting fragments with 'sep', which is itself
    passed through conditional_escape().

    'args_generator' should be an iterable yielding the sequences of 'args'
    handed to format_html().

    Example:

      format_html_join('\n', "<li>{} {}</li>", ((u.first_name, u.last_name)
                                                  for u in users))
    """
    escaped_sep = conditional_escape(sep)
    fragments = (format_html(format_string, *row) for row in args_generator)
    return mark_safe(escaped_sep.join(fragments))

127

128

@keep_lazy_text
def linebreaks(value, autoescape=False):
    """
    Turn runs of two or more newlines into <p> paragraph boundaries and each
    remaining newline into <br>. Paragraph text is passed through escape()
    first when autoescape is true.
    """
    normalized = str(normalize_newlines(value))
    rendered = []
    for chunk in re.split("\n{2,}", normalized):
        body = escape(chunk) if autoescape else chunk
        rendered.append("<p>%s</p>" % body.replace("\n", "<br>"))
    return "\n\n".join(rendered)

139

140

class MLStripper(HTMLParser):
    """
    HTMLParser subclass that records the text data and the entity/character
    references it is fed; other markup is not recorded.
    """

    def __init__(self):
        # References are left undecoded so they reach handle_entityref() and
        # handle_charref() and can be written back out unchanged.
        super().__init__(convert_charrefs=False)
        self.reset()
        # Collected output pieces, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record a run of plain text."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Record a named reference in its original "&name;" form."""
        self.fed.append(f"&{name};")

    def handle_charref(self, name):
        """Record a numeric reference in its original "&#name;" form."""
        self.fed.append(f"&#{name};")

    def get_data(self):
        """Return everything recorded so far as a single string."""
        return "".join(self.fed)

158

159

def _strip_once(value):
    """
    Run value through a fresh MLStripper a single time and return what it
    collected. Internal helper for strip_tags().
    """
    stripper = MLStripper()
    stripper.feed(value)
    stripper.close()
    return stripper.get_data()

168

169

@keep_lazy_text
def strip_tags(value):
    """Return the given HTML with all tags stripped."""
    current = str(value)
    while True:
        # Without both angle brackets there can be no tag left to strip. The
        # check is not strictly required but avoids needless parser runs; in
        # the typical case _strip_once() executes exactly once.
        if "<" not in current or ">" not in current:
            return current
        candidate = _strip_once(current)
        if candidate.count("<") == current.count("<"):
            # _strip_once() found no more tags: keep the previous value.
            return current
        current = candidate

183

184

@keep_lazy_text
def strip_spaces_between_tags(value):
    """
    Return the given HTML with any whitespace that sits between a closing
    '>' and the next '<' removed.
    """
    markup = str(value)
    return re.sub(r">\s+<", "><", markup)

189

190

def smart_urlquote(url):
    """
    Quote a URL if it isn't already quoted.

    Each component is unquoted and then quoted again, so text that is already
    percent-encoded is not encoded a second time. If the URL cannot be split,
    or its host cannot be converted by punycode(), the whole string is
    unquoted and re-quoted as a single segment instead.
    """
    # Tilde is part of RFC 3986 Section 2.3 Unreserved Characters,
    # see also https://bugs.python.org/issue16285
    safe_chars = RFC3986_SUBDELIMS + RFC3986_GENDELIMS + "~"

    def unquote_quote(segment):
        return quote(unquote(segment), safe=safe_chars)

    # Handle IDN before quoting.
    try:
        parts = urlsplit(url)
    except ValueError:
        # invalid IPv6 URL (normally square brackets in hostname part).
        return unquote_quote(url)
    scheme, netloc, path, query, fragment = parts

    try:
        netloc = punycode(netloc)  # IDN -> ACE
    except UnicodeError:  # invalid domain part
        return unquote_quote(url)

    if query:
        # Keys and values are unquoted separately so that querystring
        # separators included in query values are not mixed up. See #22267.
        pairs = parse_qsl(query, keep_blank_values=True)
        # urlencode will take care of quoting
        query = urlencode([(unquote(key), unquote(val)) for key, val in pairs])

    return urlunsplit(
        (scheme, netloc, unquote_quote(path), query, unquote_quote(fragment))
    )

226

227

class Urlizer:
    """
    Convert any URLs in text into clickable links.

    Work on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Words that look like simple email addresses become mailto: links.
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
    """

    # Characters trimmed from the end of a word before it is matched.
    trailing_punctuation_chars = ".,:;!"
    # (opening, closing) pairs that may wrap a word.
    wrapping_punctuation = [("(", ")"), ("[", "]")]

    simple_url_re = _lazy_re_compile(r"^https?://\[?\w", re.IGNORECASE)
    simple_url_2_re = _lazy_re_compile(
        r"^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$", re.IGNORECASE
    )
    # The capturing group makes split() return the separators too, so joining
    # the processed pieces reproduces the original spacing and quoting.
    word_split_re = _lazy_re_compile(r"""([\s<>"']+)""")

    mailto_template = "mailto:{local}@{domain}"
    url_template = '<a href="{href}"{attrs}>{url}</a>'

    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
        """
        If trim_url_limit is not None, truncate the URLs in the link text
        longer than this limit to trim_url_limit - 1 characters and append an
        ellipsis.

        If nofollow is True, give the links a rel="nofollow" attribute.

        If autoescape is True, autoescape the link text and URLs.
        """
        options = {
            "safe_input": isinstance(text, SafeData),
            "trim_url_limit": trim_url_limit,
            "nofollow": nofollow,
            "autoescape": autoescape,
        }
        words = self.word_split_re.split(str(text))
        return "".join([self.handle_word(word, **options) for word in words])

    def handle_word(
        self,
        word,
        *,
        safe_input,
        trim_url_limit=None,
        nofollow=False,
        autoescape=False,
    ):
        """
        Return `word` as an HTML link if it looks like a URL or an email
        address, otherwise return it as plain text.

        Plain text is passed through mark_safe() when safe_input is true,
        through escape() when only autoescape is true, and returned untouched
        otherwise.
        """
        if "." in word or "@" in word or ":" in word:
            # lead: Punctuation trimmed from the beginning of the word.
            # middle: State of the word.
            # trail: Punctuation trimmed from the end of the word.
            lead, middle, trail = self.trim_punctuation(word)
            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ""
            if self.simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif self.simple_url_2_re.match(middle):
                url = smart_urlquote("http://%s" % html.unescape(middle))
            elif ":" not in middle and self.is_email_simple(middle):
                local, domain = middle.rsplit("@", 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    # The domain can't be encoded, so no link is made. Leave
                    # url unset instead of returning the raw word, so that the
                    # plain-text handling below still escapes it when needed.
                    pass
                else:
                    url = self.mailto_template.format(local=local, domain=domain)
                    # mailto: links never carry rel="nofollow".
                    nofollow_attr = ""
            # Make link.
            if url:
                trimmed = self.trim_url(middle, limit=trim_url_limit)
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    trimmed = escape(trimmed)
                middle = self.url_template.format(
                    href=escape(url),
                    attrs=nofollow_attr,
                    url=trimmed,
                )
                return mark_safe(f"{lead}{middle}{trail}")
        # No link was produced: emit the word as plain text.
        if safe_input:
            return mark_safe(word)
        if autoescape:
            return escape(word)
        return word

    def trim_url(self, x, *, limit):
        """
        Return x unchanged when limit is None or x fits within it; otherwise
        return its first limit - 1 characters followed by an ellipsis.
        """
        if limit is None or len(x) <= limit:
            return x
        return "%s…" % x[: max(0, limit - 1)]

    def trim_punctuation(self, word):
        """
        Trim trailing and wrapping punctuation from `word`. Return the items of
        the new state.
        """
        lead, middle, trail = "", word, ""
        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        while trimmed_something:
            trimmed_something = False
            # Trim wrapping punctuation.
            for opening, closing in self.wrapping_punctuation:
                if middle.startswith(opening):
                    middle = middle[len(opening) :]
                    lead += opening
                    trimmed_something = True
                # Move a closing character at the end into trail only when it
                # is the single unmatched one; balanced ones stay in middle.
                if (
                    middle.endswith(closing)
                    and middle.count(closing) == middle.count(opening) + 1
                ):
                    middle = middle[: -len(closing)]
                    trail = closing + trail
                    trimmed_something = True
            # Trim trailing punctuation (after trimming wrapping punctuation,
            # as encoded entities contain ';'). Unescape entities to avoid
            # breaking them by removing ';'.
            middle_unescaped = html.unescape(middle)
            stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
            if middle_unescaped != stripped:
                # The count is measured on the unescaped text and then applied
                # to the end of the original (still escaped) middle.
                punctuation_count = len(middle_unescaped) - len(stripped)
                trail = middle[-punctuation_count:] + trail
                middle = middle[:-punctuation_count]
                trimmed_something = True
        return lead, middle, trail

    @staticmethod
    def is_email_simple(value):
        """Return True if value looks like an email address."""
        local, at_sign, domain = value.partition("@")
        # Exactly one @ is required, with text on both sides of it.
        if not at_sign or not local or not domain or "@" in domain:
            return False
        # Dot must be in the domain part (e.g. example.com), but not lead it.
        return "." in domain and not domain.startswith(".")

384

385

# Module-level Urlizer instance; urlize() passes its arguments to this object.
urlizer = Urlizer()

387

388

@keep_lazy_text
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Pass text and the given options to the module-level urlizer instance and
    return its result.
    """
    options = {
        "trim_url_limit": trim_url_limit,
        "nofollow": nofollow,
        "autoescape": autoescape,
    }
    return urlizer(text, **options)

394

395

def avoid_wrapping(value):
    """
    Replace each normal space in value with a non-breaking space, so that the
    text isn't wrapped in the middle of a phrase.
    """
    plain_space, non_breaking_space = " ", "\xa0"
    return value.replace(plain_space, non_breaking_space)

402

403

def html_safe(klass):
    """
    Class decorator that adds an __html__() method and makes __str__() return
    its result through mark_safe(). This helps non-Django templates to detect
    classes whose __str__ methods return SafeString.

    Raise ValueError if the class itself already defines __html__() or does
    not define __str__().
    """
    # Only names defined directly on the class count, not inherited ones.
    own_names = klass.__dict__
    if "__html__" in own_names:
        raise ValueError(
            "can't apply @html_safe to %s because it defines __html__()."
            % klass.__name__
        )
    if "__str__" not in own_names:
        raise ValueError(
            "can't apply @html_safe to %s because it doesn't define __str__()."
            % klass.__name__
        )
    original_str = klass.__str__
    klass.__str__ = lambda self: mark_safe(original_str(self))
    klass.__html__ = lambda self: str(self)
    return klass

422 return klass