
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Escaping/unescaping methods for HTML, JSON, URLs, and others.

Also includes a few other miscellaneous string manipulation functions that
have crept in over time.
"""

import html.entities
import json
import re
import urllib.parse

from tornado.util import unicode_type

import typing
from typing import Union, Any, Optional, Dict, List, Callable


_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}


def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escapes a string so it is valid within HTML or XML.

    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
    When used in attribute values the escaped strings must be enclosed
    in quotes.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.
    """
    return _XHTML_ESCAPE_RE.sub(
        lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_basestring(value)
    )
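

# Illustrative sketch (the helper name is hypothetical, not part of
# tornado's API): expected behavior of xhtml_escape on a representative
# input. Wrapped in a function so nothing runs at import time.
def _demo_xhtml_escape() -> None:
    # All five significant characters are replaced with entities.
    assert xhtml_escape('<b>"A&B"</b>') == "&lt;b&gt;&quot;A&amp;B&quot;&lt;/b&gt;"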

def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Un-escapes an XML-escaped string."""
    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
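

# Illustrative sketch (hypothetical helper, not tornado API): named,
# decimal, and hexadecimal entities are all decoded by xhtml_unescape.
def _demo_xhtml_unescape() -> None:
    assert xhtml_unescape("&lt;tag&gt; &amp; &#65; &#x41;") == "<tag> & A A"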

# The fact that json_encode wraps json.dumps is an implementation detail.
# Please see https://github.com/tornadoweb/tornado/pull/706
# before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
    """JSON-encodes the given Python object."""
    # JSON permits but does not require forward slashes to be escaped.
    # This is useful when json data is emitted in a <script> tag
    # in HTML, as it prevents </script> tags from prematurely terminating
    # the JavaScript. Some json libraries do this escaping by default,
    # although python's standard library does not, so we do it here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return json.dumps(value).replace("</", "<\\/")

def json_decode(value: Union[str, bytes]) -> Any:
    """Returns Python objects for the given JSON string.

    Supports both `str` and `bytes` inputs.
    """
    return json.loads(to_basestring(value))
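

# Illustrative sketch (hypothetical helper, not tornado API): the "</"
# escaping keeps the output safe inside a <script> tag, and json_decode
# reverses it because "\/" is a valid JSON escape for "/".
def _demo_json_round_trip() -> None:
    encoded = json_encode("</script>")
    assert encoded == '"<\\/script>"'
    assert json_decode(encoded) == "</script>"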

def squeeze(value: str) -> str:
    """Replace all sequences of whitespace chars with a single space."""
    return re.sub(r"[\x00-\x20]+", " ", value).strip()
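

# Illustrative sketch (hypothetical helper, not tornado API): runs of
# ASCII whitespace and control characters collapse to one space, and the
# result is stripped at both ends.
def _demo_squeeze() -> None:
    assert squeeze("  hello \t\n  world  ") == "hello world"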

def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
    """Returns a URL-encoded version of the given value.

    If ``plus`` is true (the default), spaces will be represented
    as "+" instead of "%20". This is appropriate for query strings
    but not for the path component of a URL. Note that this default
    is the reverse of Python's urllib module.

    .. versionadded:: 3.1
       The ``plus`` argument
    """
    quote = urllib.parse.quote_plus if plus else urllib.parse.quote
    return quote(utf8(value))
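

# Illustrative sketch (hypothetical helper, not tornado API): with
# plus=True (quote_plus) "/" is also percent-encoded; with plus=False
# (quote) "/" is left alone, which suits path components.
def _demo_url_escape() -> None:
    assert url_escape("a b/c") == "a+b%2Fc"
    assert url_escape("a b/c", plus=False) == "a%20b/c"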

@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
    pass


@typing.overload  # noqa: F811
def url_unescape(
    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
    pass


def url_unescape(  # noqa: F811
    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
    """Decodes the given value from a URL.

    The argument may be either a byte or unicode string.

    If encoding is None, the result will be a byte string. Otherwise,
    the result is a unicode string in the specified encoding.

    If ``plus`` is true (the default), plus signs will be interpreted
    as spaces (literal plus signs must be represented as "%2B"). This
    is appropriate for query strings and form-encoded values but not
    for the path component of a URL. Note that this default is the
    reverse of Python's urllib module.

    .. versionadded:: 3.1
       The ``plus`` argument
    """
    if encoding is None:
        if plus:
            # unquote_to_bytes doesn't have a _plus variant
            value = to_basestring(value).replace("+", " ")
        return urllib.parse.unquote_to_bytes(value)
    else:
        unquote = urllib.parse.unquote_plus if plus else urllib.parse.unquote
        return unquote(to_basestring(value), encoding=encoding)
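

# Illustrative sketch (hypothetical helper, not tornado API): the return
# type follows the ``encoding`` argument, bytes for None and str otherwise.
def _demo_url_unescape() -> None:
    assert url_unescape("a+b%2Fc") == "a b/c"
    assert url_unescape("a+b%2Fc", encoding=None) == b"a b/c"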

def parse_qs_bytes(
    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
    """Parses a query string like urlparse.parse_qs,
    but takes bytes and returns the values as byte strings.

    Keys still become type str (interpreted as latin1 in python3!)
    because it's too painful to keep them as byte strings in
    python3 and in practice they're nearly always ascii anyway.
    """
    # This is gross, but python3 doesn't give us another way.
    # Latin1 is the universal donor of character encodings.
    if isinstance(qs, bytes):
        qs = qs.decode("latin1")
    result = urllib.parse.parse_qs(
        qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
    )
    encoded = {}
    for k, v in result.items():
        encoded[k] = [i.encode("latin1") for i in v]
    return encoded
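

# Illustrative sketch (hypothetical helper, not tornado API): the latin1
# round trip preserves arbitrary percent-encoded bytes in the values,
# while the keys come back as str.
def _demo_parse_qs_bytes() -> None:
    assert parse_qs_bytes(b"a=1&b=%ff") == {"a": [b"1"], "b": [b"\xff"]}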

_UTF8_TYPES = (bytes, type(None))


@typing.overload
def utf8(value: bytes) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: str) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: None) -> None:
    pass


def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:  # noqa: F811
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    if not isinstance(value, unicode_type):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.encode("utf-8")

_TO_UNICODE_TYPES = (unicode_type, type(None))


@typing.overload
def to_unicode(value: str) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: bytes) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: None) -> None:
    pass


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:  # noqa: F811
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged. Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if not isinstance(value, bytes):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.decode("utf-8")

# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
native_str = to_unicode
to_basestring = to_unicode
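

# Illustrative sketch (hypothetical helper, not tornado API): utf8 and
# to_unicode are inverses on well-formed UTF-8, and both pass None through.
def _demo_string_conversions() -> None:
    assert utf8("caf\u00e9") == b"caf\xc3\xa9"
    assert to_unicode(b"caf\xc3\xa9") == "caf\u00e9"
    assert utf8(None) is None and to_unicode(None) is None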

def recursive_unicode(obj: Any) -> Any:
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    if isinstance(obj, dict):
        return dict(
            (recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items()
        )
    elif isinstance(obj, list):
        return list(recursive_unicode(i) for i in obj)
    elif isinstance(obj, tuple):
        return tuple(recursive_unicode(i) for i in obj)
    elif isinstance(obj, bytes):
        return to_unicode(obj)
    else:
        return obj
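

# Illustrative sketch (hypothetical helper, not tornado API): the walk
# recurses through dict keys and values as well as list/tuple items,
# leaving non-bytes leaves untouched.
def _demo_recursive_unicode() -> None:
    assert recursive_unicode({b"k": [b"v", (b"t", 1)]}) == {"k": ["v", ("t", 1)]}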

# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
_URL_RE = re.compile(
    to_unicode(
        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""  # noqa: E501
    )
)

def linkify(
    text: Union[str, bytes],
    shorten: bool = False,
    extra_params: Union[str, Callable[[str], str]] = "",
    require_protocol: bool = False,
    permitted_protocols: List[str] = ["http", "https"],
) -> str:
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
      taking the link as an argument and returning the extra text
      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
      or::

          def extra_params_cb(url):
              if url.startswith("http://example.com"):
                  return 'class="internal"'
              else:
                  return 'class="external" rel="nofollow"'
          linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
      this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
      "mailto"])``. It is very unsafe to include protocols such as
      ``javascript``.
    """
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m: typing.Match) -> str:
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = (
                    url[:proto_len]
                    + parts[0]
                    + "/"
                    + parts[1][:8].split("?")[0].split(".")[0]
                )

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind("&")
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return '<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
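

# Illustrative sketch (hypothetical helper, not tornado API): the
# docstring example, plus extra attributes injected via extra_params;
# a protocol-less host gets an http:// href by default.
def _demo_linkify() -> None:
    assert linkify("Hello http://tornadoweb.org!") == (
        'Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!'
    )
    assert linkify("see www.example.com", extra_params='rel="nofollow"') == (
        'see <a href="http://www.example.com" rel="nofollow">www.example.com</a>'
    )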

def _convert_entity(m: typing.Match) -> str:
    if m.group(1) == "#":
        try:
            if m.group(2)[:1].lower() == "x":
                return chr(int(m.group(2)[1:], 16))
            else:
                return chr(int(m.group(2)))
        except ValueError:
            return "&#%s;" % m.group(2)
    try:
        return _HTML_UNICODE_MAP[m.group(2)]
    except KeyError:
        return "&%s;" % m.group(2)

def _build_unicode_map() -> Dict[str, str]:
    unicode_map = {}
    for name, value in html.entities.name2codepoint.items():
        unicode_map[name] = chr(value)
    return unicode_map


_HTML_UNICODE_MAP = _build_unicode_map()