Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tornado/escape.py: 35%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

126 statements  

1# 

2# Copyright 2009 Facebook 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); you may 

5# not use this file except in compliance with the License. You may obtain 

6# a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 

12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 

13# License for the specific language governing permissions and limitations 

14# under the License. 

15 

16"""Escaping/unescaping methods for HTML, JSON, URLs, and others. 

17 

18Also includes a few other miscellaneous string manipulation functions that 

19have crept in over time. 

20 

21Many functions in this module have near-equivalents in the standard library 

22(the differences mainly relate to handling of bytes and unicode strings, 

23and were more relevant in Python 2). In new code, the standard library 

24functions are encouraged instead of this module where applicable. See the 

25docstrings on each function for details. 

26""" 

27 

28import html 

29import json 

30import re 

31import urllib.parse 

32 

33from tornado.util import unicode_type 

34 

35import typing 

36from typing import Union, Any, Optional, Dict, List, Callable 

37 

38 

def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escape a string so it can be embedded safely in HTML or XML.

    The characters ``<``, ``>``, ``"``, ``'``, and ``&`` are replaced by
    character entities. When used inside attribute values, the escaped
    strings must be enclosed in quotes.

    Equivalent to `html.escape` except that this function always returns
    type `str` while `html.escape` returns `bytes` if its input is `bytes`.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.

    .. versionchanged:: 6.4

       Now simply wraps `html.escape`. This is equivalent to the old behavior
       except that single quotes are now escaped as ``&#x27;`` instead of
       ``&#39;`` and performance may be different.
    """
    # Normalize to str first so bytes input also yields a str result.
    text = to_unicode(value)
    return html.escape(text)

60 

61 

def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Reverse XML/HTML escaping, turning entities back into characters.

    Equivalent to `html.unescape` except that this function always returns
    type `str` while `html.unescape` returns `bytes` if its input is `bytes`.

    .. versionchanged:: 6.4

       Now simply wraps `html.unescape`. This changes behavior for some inputs
       as required by the HTML 5 specification
       https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state

       Some invalid inputs such as surrogates now raise an error, and numeric
       references to certain ISO-8859-1 characters are now handled correctly.
    """
    # Normalize to str first so bytes input also yields a str result.
    text = to_unicode(value)
    return html.unescape(text)

78 

79 

# The fact that json_encode wraps json.dumps is an implementation detail.
# Please see https://github.com/tornadoweb/tornado/pull/706
# before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
    """JSON-encodes the given Python object.

    Equivalent to `json.dumps` with the additional guarantee that the output
    will never contain the character sequence ``</`` which can be problematic
    when JSON is embedded in an HTML ``<script>`` tag.
    """
    encoded = json.dumps(value)
    # JSON permits (but does not require) escaping forward slashes. Doing so
    # for the "</" sequence prevents a literal "</script>" inside the data
    # from prematurely terminating an HTML <script> block. Python's standard
    # library does not perform this escaping itself, so we add it here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return encoded.replace("</", "<\\/")

97 

98 

def json_decode(value: Union[str, bytes]) -> Any:
    """Returns Python objects for the given JSON string.

    Supports both `str` and `bytes` inputs. Equivalent to `json.loads`.
    """
    # json.loads accepts str, bytes, and bytearray directly, so no
    # conversion is needed here.
    return json.loads(value)

105 

106 

def squeeze(value: str) -> str:
    """Collapse each run of ASCII whitespace/control characters (0x00-0x20)
    into a single space, and strip leading/trailing whitespace."""
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()

110 

111 

def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
    """Returns a URL-encoded version of the given value.

    Equivalent to either `urllib.parse.quote_plus` or `urllib.parse.quote` depending on the ``plus``
    argument.

    If ``plus`` is true (the default), spaces will be represented as ``+`` and slashes will be
    represented as ``%2F``. This is appropriate for query strings. If ``plus`` is false, spaces
    will be represented as ``%20`` and slashes are left as-is. This is appropriate for the path
    component of a URL. Note that the default of ``plus=True`` is effectively the
    reverse of Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    # quote_plus encodes "/" and turns spaces into "+"; plain quote leaves
    # "/" alone and uses "%20" for spaces.
    if plus:
        return urllib.parse.quote_plus(value)
    return urllib.parse.quote(value)

129 

130 

@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
    pass


@typing.overload
def url_unescape(
    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
    pass


def url_unescape(
    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
    """Decodes the given value from a URL.

    The argument may be either a byte or unicode string.

    If encoding is None, the result will be a byte string and this function is equivalent to
    `urllib.parse.unquote_to_bytes` if ``plus=False``. Otherwise, the result is a unicode string in
    the specified encoding and this function is equivalent to either `urllib.parse.unquote_plus` or
    `urllib.parse.unquote` except that this function also accepts `bytes` as input.

    If ``plus`` is true (the default), plus signs will be interpreted as spaces (literal plus signs
    must be represented as "%2B"). This is appropriate for query strings and form-encoded values
    but not for the path component of a URL. Note that this default is the reverse of Python's
    urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    if encoding is None:
        if plus:
            # urllib has no "_plus" variant of unquote_to_bytes, so map
            # "+" to " " ourselves before unquoting.
            value = to_basestring(value).replace("+", " ")
        return urllib.parse.unquote_to_bytes(value)
    # Text mode: choose the unquote flavor based on ``plus``.
    if plus:
        return urllib.parse.unquote_plus(to_basestring(value), encoding=encoding)
    return urllib.parse.unquote(to_basestring(value), encoding=encoding)

171 

172 

def parse_qs_bytes(
    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
    """Parses a query string like urlparse.parse_qs,
    but takes bytes and returns the values as byte strings.

    Keys still become type str (interpreted as latin1 in python3!)
    because it's too painful to keep them as byte strings in
    python3 and in practice they're nearly always ascii anyway.
    """
    # This is gross, but python3 doesn't give us another way.
    # Latin1 is the universal donor of character encodings: every byte
    # value round-trips through it unchanged, so decode -> parse -> encode
    # preserves the original value bytes exactly.
    if isinstance(qs, bytes):
        qs = qs.decode("latin1")
    parsed = urllib.parse.parse_qs(
        qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
    )
    return {
        key: [item.encode("latin1") for item in items]
        for key, items in parsed.items()
    }

194 

195 

# Values of these types pass through utf8() unchanged (bytes or None).
_UTF8_TYPES = (bytes, type(None))


@typing.overload
def utf8(value: bytes) -> bytes:
    pass


@typing.overload
def utf8(value: str) -> bytes:
    pass


@typing.overload
def utf8(value: None) -> None:
    pass


def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    # Fast path: already bytes (or None) -- nothing to convert.
    if isinstance(value, _UTF8_TYPES):
        return value
    if isinstance(value, unicode_type):
        return value.encode("utf-8")
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))

225 

226 

# Values of these types pass through to_unicode() unchanged (str or None).
_TO_UNICODE_TYPES = (unicode_type, type(None))

228 

229 

@typing.overload
def to_unicode(value: str) -> str:
    pass


@typing.overload
def to_unicode(value: bytes) -> str:
    pass


@typing.overload
def to_unicode(value: None) -> None:
    pass


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged. Otherwise it must be a byte string and is decoded as utf8.
    """
    # Fast path: already str (or None) -- nothing to convert.
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if isinstance(value, bytes):
        return value.decode("utf-8")
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))

256 

257 

# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
native_str = to_unicode
# Alias of to_unicode; "basestring" is Python 2 terminology retained so
# existing callers keep working.
to_basestring = to_unicode

266 

267 

def recursive_unicode(obj: Any) -> Any:
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries; any other type is returned
    unchanged.
    """
    if isinstance(obj, bytes):
        return to_unicode(obj)
    if isinstance(obj, dict):
        # Convert both keys and values.
        return {recursive_unicode(k): recursive_unicode(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    return obj

285 

286 

# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
# Capture groups (consumed by linkify() below):
#   group 1 = the entire matched URL
#   group 2 = the protocol/scheme, if present (e.g. "http")
#   group 3 = the slashes following the scheme (1-3 of them), if present
# Note the pattern matches against already-HTML-escaped text, hence the
# explicit handling of the "&amp;" and "&quot;" entities.
_URL_RE = re.compile(
    to_unicode(
        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""" # noqa: E501
    )
)

299 

300 

def linkify(
    text: Union[str, bytes],
    shorten: bool = False,
    extra_params: Union[str, Callable[[str], str]] = "",
    require_protocol: bool = False,
    # NOTE(review): mutable default list is acceptable here because it is
    # only ever read (membership test inside make_link), never mutated.
    permitted_protocols: List[str] = ["http", "https"],
) -> str:
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
      taking the link as an argument and returning the extra text
      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
      or::

          def extra_params_cb(url):
              if url.startswith("http://example.com"):
                  return 'class="internal"'
              else:
                  return 'class="external" rel="nofollow"'
          linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
      this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
      "mailto"])``. It is very unsafe to include protocols such as
      ``javascript``.
    """
    # Normalize a string extra_params once, up front, so make_link can use
    # it directly; a callable extra_params is invoked per link instead.
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m: typing.Match) -> str:
        # Regex groups (see _URL_RE): 1 = full URL, 2 = protocol, 3 = slashes.
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                # Length of the "scheme://" prefix that should survive
                # clipping; m.group(3) is the run of slashes (may be absent).
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = (
                    url[:proto_len]
                    + parts[0]
                    + "/"
                    + parts[1][:8].split("?")[0].split(".")[0]
                )

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind("&")
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    # Clipping didn't actually save space; show the full URL.
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return '<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)