Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/tornado/escape.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

127 statements  

1# 

2# Copyright 2009 Facebook 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); you may 

5# not use this file except in compliance with the License. You may obtain 

6# a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 

12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 

13# License for the specific language governing permissions and limitations 

14# under the License. 

15 

16"""Escaping/unescaping methods for HTML, JSON, URLs, and others. 

17 

18Also includes a few other miscellaneous string manipulation functions that 

19have crept in over time. 

20 

21Many functions in this module have near-equivalents in the standard library 

22(the differences mainly relate to handling of bytes and unicode strings, 

23and were more relevant in Python 2). In new code, the standard library 

24functions are encouraged instead of this module where applicable. See the 

25docstrings on each function for details. 

26""" 

27 

28import html 

29import json 

30import re 

31import urllib.parse 

32 

33from tornado.util import unicode_type 

34 

35import typing 

36from typing import Union, Any, Optional, Dict, List, Callable 

37 

38 

def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escape a string so it can be embedded safely in HTML or XML.

    The characters ``<``, ``>``, ``"``, ``'``, and ``&`` are replaced with
    entity references.  Escaped strings used in attribute values must be
    wrapped in quotes.

    This is the same as `html.escape` except that the return type is always
    `str`, whereas `html.escape` returns `bytes` when given `bytes`.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.

    .. versionchanged:: 6.4

       Now simply wraps `html.escape`. This is equivalent to the old behavior
       except that single quotes are now escaped as ``&#x27;`` instead of
       ``&#39;`` and performance may be different.
    """
    # Normalize to str first so bytes inputs also produce a str result.
    text = to_unicode(value)
    return html.escape(text)

60 

61 

def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Reverse XML/HTML entity escaping in a string.

    The same as `html.unescape` except that the return type is always
    `str`, whereas `html.unescape` returns `bytes` when given `bytes`.

    .. versionchanged:: 6.4

       Now simply wraps `html.unescape`. This changes behavior for some inputs
       as required by the HTML 5 specification
       https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state

       Some invalid inputs such as surrogates now raise an error, and numeric
       references to certain ISO-8859-1 characters are now handled correctly.
    """
    # Normalize to str first so bytes inputs also produce a str result.
    text = to_unicode(value)
    return html.unescape(text)

78 

79 

# The fact that json_encode wraps json.dumps is an implementation detail.
# Please see https://github.com/tornadoweb/tornado/pull/706
# before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
    """JSON-encodes the given Python object.

    Equivalent to `json.dumps` with the additional guarantee that the output
    will never contain the character sequence ``</`` which can be problematic
    when JSON is embedded in an HTML ``<script>`` tag.
    """
    encoded = json.dumps(value)
    # JSON allows (but does not require) "/" to be escaped.  Escaping "</"
    # prevents an embedded "</script>" from terminating a surrounding
    # <script> tag.  Some json libraries do this by default; Python's
    # standard library does not, so it is done here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return encoded.replace("</", "<\\/")

97 

98 

def json_decode(value: Union[str, bytes]) -> Any:
    """Returns Python objects for the given JSON string.

    Supports both `str` and `bytes` inputs. Equivalent to `json.loads`.
    """
    return json.loads(value)

105 

106 

def squeeze(value: str) -> str:
    """Collapse every run of whitespace characters into a single space.

    Leading and trailing whitespace is removed entirely.
    """
    # \x00-\x20 covers ASCII control characters plus the space character.
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()

110 

111 

def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
    """Return a URL-encoded form of ``value``.

    Delegates to `urllib.parse.quote_plus` or `urllib.parse.quote` depending
    on ``plus``.

    With ``plus`` true (the default), spaces become ``+`` and slashes become
    ``%2F`` — the right choice for query strings.  With ``plus`` false,
    spaces become ``%20`` and slashes pass through unchanged — the right
    choice for the path component of a URL.  Note that ``plus=True`` is
    effectively the reverse of Python's urllib module default.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    if plus:
        return urllib.parse.quote_plus(value)
    return urllib.parse.quote(value)

129 

130 

@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
    pass


@typing.overload
def url_unescape(
    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
    pass


def url_unescape(
    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
    """Decode a URL-escaped value.

    ``value`` may be either a byte or unicode string.

    With ``encoding=None`` the result is a byte string, and this function is
    equivalent to `urllib.parse.unquote_to_bytes` if ``plus=False``.
    Otherwise the result is a unicode string in the given encoding, and this
    function is equivalent to `urllib.parse.unquote_plus` or
    `urllib.parse.unquote` except that `bytes` input is also accepted.

    With ``plus`` true (the default), plus signs are interpreted as spaces
    (literal plus signs must be represented as "%2B").  This suits query
    strings and form-encoded values but not the path component of a URL.
    Note that this default is the reverse of Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    if encoding is None:
        if plus:
            # unquote_to_bytes has no "+"-aware variant, so translate the
            # plus signs into spaces before unquoting.
            value = to_basestring(value).replace("+", " ")
        return urllib.parse.unquote_to_bytes(value)
    if plus:
        return urllib.parse.unquote_plus(to_basestring(value), encoding=encoding)
    return urllib.parse.unquote(to_basestring(value), encoding=encoding)

171 

172 

def parse_qs_bytes(
    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
    """Parses a query string like urlparse.parse_qs,
    but takes bytes and returns the values as byte strings.

    Keys still become type str (interpreted as latin1 in python3!)
    because it's too painful to keep them as byte strings in
    python3 and in practice they're nearly always ascii anyway.
    """
    # This is gross, but python3 doesn't give us another way.
    # Latin1 is the universal donor of character encodings: it round-trips
    # every byte value, so decoding and re-encoding loses nothing.
    if isinstance(qs, bytes):
        qs = qs.decode("latin1")
    parsed = urllib.parse.parse_qs(
        qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
    )
    return {
        name: [item.encode("latin1") for item in values]
        for name, values in parsed.items()
    }

194 

195 

# Types passed through utf8() unchanged.
_UTF8_TYPES = (bytes, type(None))


@typing.overload
def utf8(value: bytes) -> bytes:
    pass


@typing.overload
def utf8(value: str) -> bytes:
    pass


@typing.overload
def utf8(value: None) -> None:
    pass


def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:
    """Convert a string argument to a byte string.

    Byte strings and None pass through unchanged; unicode strings are
    encoded as utf8.  Any other type raises `TypeError`.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    if isinstance(value, unicode_type):
        return value.encode("utf-8")
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))

225 

226 

# Types passed through to_unicode() unchanged.
_TO_UNICODE_TYPES = (unicode_type, type(None))


@typing.overload
def to_unicode(value: str) -> str:
    pass


@typing.overload
def to_unicode(value: bytes) -> str:
    pass


@typing.overload
def to_unicode(value: None) -> None:
    pass


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:
    """Convert a string argument to a unicode string.

    Unicode strings and None pass through unchanged; byte strings are
    decoded as utf8.  Any other type raises `TypeError`.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if isinstance(value, bytes):
        return value.decode("utf-8")
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))

256 

257 

# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type.
# On Python 3 the native string type is str, so both aliases are simply
# to_unicode; they are kept for backward compatibility with older callers.
native_str = to_unicode
to_basestring = to_unicode

266 

267 

def recursive_unicode(obj: Any) -> Any:
    """Walk a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries (applied recursively to both
    keys and values).  Any other object is returned unchanged.
    """
    if isinstance(obj, bytes):
        return to_unicode(obj)
    if isinstance(obj, dict):
        return {
            recursive_unicode(key): recursive_unicode(val)
            for key, val in obj.items()
        }
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    return obj

283 

284 

# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
# Capture groups (as consumed by linkify's make_link):
#   group 1: the entire matched URL
#   group 2: the protocol/scheme (or None for bare www. links)
#   group 3: the slashes after the scheme (1-3 of them)
_URL_RE = re.compile(
    to_unicode(
        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""" # noqa: E501
    )
)

297 

298 

def linkify(
    text: Union[str, bytes],
    shorten: bool = False,
    extra_params: Union[str, Callable[[str], str]] = "",
    require_protocol: bool = False,
    # A tuple (not a list) so the default is immutable; a shared mutable
    # default list is a classic Python pitfall.  Callers may still pass any
    # collection supporting "in".
    permitted_protocols: typing.Collection[str] = ("http", "https"),
) -> str:
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
      taking the link as an argument and returning the extra text
      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
      or::

          def extra_params_cb(url):
              if url.startswith("http://example.com"):
                  return 'class="internal"'
              else:
                  return 'class="external" rel="nofollow"'
          linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
      this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
      "mailto"])``. It is very unsafe to include protocols such as
      ``javascript``.
    """
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m: typing.Match) -> str:
        # Render one regex match as an <a> tag (or return it unchanged).
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # not protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = (
                    url[:proto_len]
                    + parts[0]
                    + "/"
                    + parts[1][:8].split("?")[0].split(".")[0]
                )

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind("&")
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return f'<a href="{href}"{params}>{url}</a>'

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)