
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Escaping/unescaping methods for HTML, JSON, URLs, and others.

Also includes a few other miscellaneous string manipulation functions that
have crept in over time.
"""

import html.entities
import json
import re
import urllib.parse

from tornado.util import unicode_type

import typing
from typing import Union, Any, Optional, Dict, List, Callable


_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}


def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escapes a string so it is valid within HTML or XML.

    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
    When used in attribute values the escaped strings must be enclosed
    in quotes.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.
    """
    return _XHTML_ESCAPE_RE.sub(
        lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_basestring(value)
    )
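

# Illustrative sketch (the helper name is hypothetical, not part of
# tornado's API): expected behavior of xhtml_escape on a representative
# input. Wrapped in a function so nothing runs at import time.
def _demo_xhtml_escape() -> None:
    # All five significant characters are replaced with entities.
    assert xhtml_escape('<b>"A&B"</b>') == "&lt;b&gt;&quot;A&amp;B&quot;&lt;/b&gt;"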

def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Un-escapes an XML-escaped string."""
    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
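

# Illustrative sketch (hypothetical helper, not tornado API): named,
# decimal, and hexadecimal entities are all decoded by xhtml_unescape.
def _demo_xhtml_unescape() -> None:
    assert xhtml_unescape("&lt;tag&gt; &amp; &#65; &#x41;") == "<tag> & A A"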

# The fact that json_encode wraps json.dumps is an implementation detail.
# Please see https://github.com/tornadoweb/tornado/pull/706
# before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
    """JSON-encodes the given Python object."""
    # JSON permits but does not require forward slashes to be escaped.
    # This is useful when json data is emitted in a <script> tag
    # in HTML, as it prevents </script> tags from prematurely terminating
    # the JavaScript. Some json libraries do this escaping by default,
    # although python's standard library does not, so we do it here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return json.dumps(value).replace("</", "<\\/")

def json_decode(value: Union[str, bytes]) -> Any:
    """Returns Python objects for the given JSON string.

    Supports both `str` and `bytes` inputs.
    """
    return json.loads(to_basestring(value))
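

# Illustrative sketch (hypothetical helper, not tornado API): the "</"
# escaping keeps the output safe inside a <script> tag, and json_decode
# reverses it because "\/" is a valid JSON escape for "/".
def _demo_json_round_trip() -> None:
    encoded = json_encode("</script>")
    assert encoded == '"<\\/script>"'
    assert json_decode(encoded) == "</script>"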

def squeeze(value: str) -> str:
    """Replace all sequences of whitespace chars with a single space."""
    return re.sub(r"[\x00-\x20]+", " ", value).strip()
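

# Illustrative sketch (hypothetical helper, not tornado API): runs of
# ASCII whitespace and control characters collapse to one space, and the
# result is stripped at both ends.
def _demo_squeeze() -> None:
    assert squeeze("  hello \t\n  world  ") == "hello world"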

def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
    """Returns a URL-encoded version of the given value.

    If ``plus`` is true (the default), spaces will be represented
    as "+" instead of "%20". This is appropriate for query strings
    but not for the path component of a URL. Note that this default
    is the reverse of Python's urllib module.

    .. versionadded:: 3.1
       The ``plus`` argument
    """
    quote = urllib.parse.quote_plus if plus else urllib.parse.quote
    return quote(utf8(value))
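

# Illustrative sketch (hypothetical helper, not tornado API): with
# plus=True (quote_plus) "/" is also percent-encoded; with plus=False
# (quote) "/" is left alone, which suits path components.
def _demo_url_escape() -> None:
    assert url_escape("a b/c") == "a+b%2Fc"
    assert url_escape("a b/c", plus=False) == "a%20b/c"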

@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
    pass


@typing.overload  # noqa: F811
def url_unescape(
    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
    pass


def url_unescape(  # noqa: F811
    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
    """Decodes the given value from a URL.

    The argument may be either a byte or unicode string.

    If encoding is None, the result will be a byte string. Otherwise,
    the result is a unicode string in the specified encoding.

    If ``plus`` is true (the default), plus signs will be interpreted
    as spaces (literal plus signs must be represented as "%2B"). This
    is appropriate for query strings and form-encoded values but not
    for the path component of a URL. Note that this default is the
    reverse of Python's urllib module.

    .. versionadded:: 3.1
       The ``plus`` argument
    """
    if encoding is None:
        if plus:
            # unquote_to_bytes doesn't have a _plus variant
            value = to_basestring(value).replace("+", " ")
        return urllib.parse.unquote_to_bytes(value)
    else:
        unquote = urllib.parse.unquote_plus if plus else urllib.parse.unquote
        return unquote(to_basestring(value), encoding=encoding)
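

# Illustrative sketch (hypothetical helper, not tornado API): the return
# type follows the ``encoding`` argument, bytes for None and str otherwise.
def _demo_url_unescape() -> None:
    assert url_unescape("a+b%2Fc") == "a b/c"
    assert url_unescape("a+b%2Fc", encoding=None) == b"a b/c"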

def parse_qs_bytes(
    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
    """Parses a query string like urlparse.parse_qs,
    but takes bytes and returns the values as byte strings.

    Keys still become type str (interpreted as latin1 in python3!)
    because it's too painful to keep them as byte strings in
    python3 and in practice they're nearly always ascii anyway.
    """
    # This is gross, but python3 doesn't give us another way.
    # Latin1 is the universal donor of character encodings.
    if isinstance(qs, bytes):
        qs = qs.decode("latin1")
    result = urllib.parse.parse_qs(
        qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
    )
    encoded = {}
    for k, v in result.items():
        encoded[k] = [i.encode("latin1") for i in v]
    return encoded
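

# Illustrative sketch (hypothetical helper, not tornado API): the latin1
# round trip preserves arbitrary percent-encoded bytes in the values,
# while the keys come back as str.
def _demo_parse_qs_bytes() -> None:
    assert parse_qs_bytes(b"a=1&b=%ff") == {"a": [b"1"], "b": [b"\xff"]}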

_UTF8_TYPES = (bytes, type(None))


@typing.overload
def utf8(value: bytes) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: str) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: None) -> None:
    pass


def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:  # noqa: F811
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    if not isinstance(value, unicode_type):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.encode("utf-8")

_TO_UNICODE_TYPES = (unicode_type, type(None))


@typing.overload
def to_unicode(value: str) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: bytes) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: None) -> None:
    pass


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:  # noqa: F811
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged. Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if not isinstance(value, bytes):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.decode("utf-8")

# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
native_str = to_unicode
to_basestring = to_unicode
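

# Illustrative sketch (hypothetical helper, not tornado API): utf8 and
# to_unicode are inverses on well-formed UTF-8, and both pass None through.
def _demo_string_conversions() -> None:
    assert utf8("caf\u00e9") == b"caf\xc3\xa9"
    assert to_unicode(b"caf\xc3\xa9") == "caf\u00e9"
    assert utf8(None) is None and to_unicode(None) is None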

def recursive_unicode(obj: Any) -> Any:
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    if isinstance(obj, dict):
        return dict(
            (recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items()
        )
    elif isinstance(obj, list):
        return list(recursive_unicode(i) for i in obj)
    elif isinstance(obj, tuple):
        return tuple(recursive_unicode(i) for i in obj)
    elif isinstance(obj, bytes):
        return to_unicode(obj)
    else:
        return obj
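

# Illustrative sketch (hypothetical helper, not tornado API): the walk
# recurses through dict keys and values as well as list/tuple items,
# leaving non-bytes leaves untouched.
def _demo_recursive_unicode() -> None:
    assert recursive_unicode({b"k": [b"v", (b"t", 1)]}) == {"k": ["v", ("t", 1)]}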

# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
_URL_RE = re.compile(
    to_unicode(
        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""  # noqa: E501
    )
)

def linkify(
    text: Union[str, bytes],
    shorten: bool = False,
    extra_params: Union[str, Callable[[str], str]] = "",
    require_protocol: bool = False,
    permitted_protocols: List[str] = ["http", "https"],
) -> str:
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
      taking the link as an argument and returning the extra text
      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
      or::

          def extra_params_cb(url):
              if url.startswith("http://example.com"):
                  return 'class="internal"'
              else:
                  return 'class="external" rel="nofollow"'
          linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
      this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
      "mailto"])``. It is very unsafe to include protocols such as
      ``javascript``.
    """
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m: typing.Match) -> str:
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = (
                    url[:proto_len]
                    + parts[0]
                    + "/"
                    + parts[1][:8].split("?")[0].split(".")[0]
                )

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind("&")
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return '<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
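

# Illustrative sketch (hypothetical helper, not tornado API): the
# docstring example, plus extra attributes injected via extra_params;
# a protocol-less host gets an http:// href by default.
def _demo_linkify() -> None:
    assert linkify("Hello http://tornadoweb.org!") == (
        'Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!'
    )
    assert linkify("see www.example.com", extra_params='rel="nofollow"') == (
        'see <a href="http://www.example.com" rel="nofollow">www.example.com</a>'
    )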

def _convert_entity(m: typing.Match) -> str:
    if m.group(1) == "#":
        try:
            if m.group(2)[:1].lower() == "x":
                return chr(int(m.group(2)[1:], 16))
            else:
                return chr(int(m.group(2)))
        except ValueError:
            return "&#%s;" % m.group(2)
    try:
        return _HTML_UNICODE_MAP[m.group(2)]
    except KeyError:
        return "&%s;" % m.group(2)

def _build_unicode_map() -> Dict[str, str]:
    unicode_map = {}
    for name, value in html.entities.name2codepoint.items():
        unicode_map[name] = chr(value)
    return unicode_map


_HTML_UNICODE_MAP = _build_unicode_map()