Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tornado/escape.py: 36%
146 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1#
2# Copyright 2009 Facebook
3#
4# Licensed under the Apache License, Version 2.0 (the "License"); you may
5# not use this file except in compliance with the License. You may obtain
6# a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13# License for the specific language governing permissions and limitations
14# under the License.
16"""Escaping/unescaping methods for HTML, JSON, URLs, and others.
18Also includes a few other miscellaneous string manipulation functions that
19have crept in over time.
20"""
22import html.entities
23import json
24import re
25import urllib.parse
27from tornado.util import unicode_type
29import typing
30from typing import Union, Any, Optional, Dict, List, Callable
_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
# Each XML-special character maps to its character-entity reference.
# (The values here had been HTML-decoded back to the bare characters,
# which made xhtml_escape an identity transform; restored to entities.)
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}
def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escapes a string so it is valid within HTML or XML.

    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
    When used in attribute values the escaped strings must be enclosed
    in quotes.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.
    """
    # Named callback instead of an inline lambda; looks up the entity
    # replacement for each matched special character.
    def _replace(match: typing.Match) -> str:
        return _XHTML_ESCAPE_DICT[match.group(0)]

    return _XHTML_ESCAPE_RE.sub(_replace, to_basestring(value))
def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Un-escapes an XML-escaped string."""
    # Match both numeric ("&#65;") and named ("&amp;") entity references;
    # _convert_entity performs the per-match translation.
    text = _unicode(value)
    return re.sub(r"&(#?)(\w+?);", _convert_entity, text)
# The fact that json_encode wraps json.dumps is an implementation detail.
# Please see https://github.com/tornadoweb/tornado/pull/706
# before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
    """JSON-encodes the given Python object."""
    # JSON permits but does not require forward slashes to be escaped.
    # This is useful when json data is emitted in a <script> tag
    # in HTML, as it prevents </script> tags from prematurely terminating
    # the JavaScript. Some json libraries do this escaping by default,
    # although python's standard library does not, so we do it here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    encoded = json.dumps(value)
    return encoded.replace("</", "<\\/")
def json_decode(value: Union[str, bytes]) -> Any:
    """Returns Python objects for the given JSON string.

    Supports both `str` and `bytes` inputs.
    """
    # Normalize bytes to str first; json.loads does the actual parsing.
    text = to_basestring(value)
    return json.loads(text)
def squeeze(value: str) -> str:
    """Replace all sequences of whitespace chars with a single space."""
    # [\x00-\x20] covers ASCII control characters plus the space itself;
    # leading/trailing runs are then removed entirely by strip().
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()
def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
    """Returns a URL-encoded version of the given value.

    If ``plus`` is true (the default), spaces will be represented
    as "+" instead of "%20". This is appropriate for query strings
    but not for the path component of a URL. Note that this default
    is the reverse of Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    encoded = utf8(value)
    if plus:
        return urllib.parse.quote_plus(encoded)
    return urllib.parse.quote(encoded)
@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
    pass


@typing.overload  # noqa: F811
def url_unescape(
    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
    pass


def url_unescape(  # noqa: F811
    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
    """Decodes the given value from a URL.

    The argument may be either a byte or unicode string.

    If encoding is None, the result will be a byte string. Otherwise,
    the result is a unicode string in the specified encoding.

    If ``plus`` is true (the default), plus signs will be interpreted
    as spaces (literal plus signs must be represented as "%2B"). This
    is appropriate for query strings and form-encoded values but not
    for the path component of a URL. Note that this default is the
    reverse of Python's urllib module.

    .. versionadded:: 3.1
       The ``plus`` argument
    """
    if encoding is not None:
        # Text result: urllib provides both plus-aware and plain variants.
        unquote = urllib.parse.unquote_plus if plus else urllib.parse.unquote
        return unquote(to_basestring(value), encoding=encoding)
    # Bytes result: unquote_to_bytes has no "_plus" variant, so translate
    # plus signs to spaces up front when requested.
    if plus:
        value = to_basestring(value).replace("+", " ")
    return urllib.parse.unquote_to_bytes(value)
def parse_qs_bytes(
    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
    """Parses a query string like urlparse.parse_qs,
    but takes bytes and returns the values as byte strings.

    Keys still become type str (interpreted as latin1 in python3!)
    because it's too painful to keep them as byte strings in
    python3 and in practice they're nearly always ascii anyway.
    """
    # This is gross, but python3 doesn't give us another way.
    # Latin1 is the universal donor of character encodings: every byte
    # value round-trips through it unchanged.
    if isinstance(qs, bytes):
        qs = qs.decode("latin1")
    parsed = urllib.parse.parse_qs(
        qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
    )
    # Re-encode each value back to bytes, leaving keys as (latin1) str.
    return {
        key: [item.encode("latin1") for item in values]
        for key, values in parsed.items()
    }
# Types that utf8() passes through unchanged.
_UTF8_TYPES = (bytes, type(None))


@typing.overload
def utf8(value: bytes) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: str) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: None) -> None:
    pass


def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:  # noqa: F811
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    if isinstance(value, unicode_type):
        return value.encode("utf-8")
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
# Types that to_unicode() passes through unchanged.
_TO_UNICODE_TYPES = (unicode_type, type(None))


@typing.overload
def to_unicode(value: str) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: bytes) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: None) -> None:
    pass


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:  # noqa: F811
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged. Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if isinstance(value, bytes):
        return value.decode("utf-8")
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type.
# Both aliases decode bytes to str and pass str/None through unchanged.
native_str = to_unicode
to_basestring = to_unicode
def recursive_unicode(obj: Any) -> Any:
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    # The isinstance checks are mutually exclusive, so testing bytes
    # first (instead of last, as before) does not change behavior.
    if isinstance(obj, bytes):
        return to_unicode(obj)
    if isinstance(obj, dict):
        return {
            recursive_unicode(key): recursive_unicode(val)
            for key, val in obj.items()
        }
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    return obj
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
#
# Note: this regex runs over xhtml_escape()d text (see linkify), so it must
# match the literal entity sequences "&amp;" and "&quot;"; the bare "&"/'"'
# alternatives that appeared here were entity-decoding corruption.
_URL_RE = re.compile(
    to_unicode(
        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""  # noqa: E501
    )
)
# NOTE(review): the mutable default for permitted_protocols is shared across
# calls; it is only ever read here, so this is safe but worth knowing.
def linkify(
    text: Union[str, bytes],
    shorten: bool = False,
    extra_params: Union[str, Callable[[str], str]] = "",
    require_protocol: bool = False,
    permitted_protocols: List[str] = ["http", "https"],
) -> str:
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
      taking the link as an argument and returning the extra text
      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
      or::

          def extra_params_cb(url):
              if url.startswith("http://example.com"):
                  return 'class="internal"'
              else:
                  return 'class="external" rel="nofollow"'
          linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
      this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
      "mailto"])``. It is very unsafe to include protocols such as
      ``javascript``.
    """
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    # Callback for _URL_RE.sub: wraps one matched url in an <a> tag,
    # optionally shortening the visible text.
    def make_link(m: typing.Match) -> str:
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # not protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = (
                    url[:proto_len]
                    + parts[0]
                    + "/"
                    + parts[1][:8].split("?")[0].split(".")[0]
                )

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind("&")
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    # Shortening didn't actually save anything; show it all.
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return '<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp;
    # so that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
def _convert_entity(m: typing.Match) -> str:
    """re.sub callback: translate one matched HTML entity to its character.

    Unknown or malformed entities are returned unchanged (re-serialized).
    """
    body = m.group(2)
    if m.group(1) == "#":
        # Numeric character reference: decimal, or hex with an "x" prefix.
        try:
            if body[:1].lower() == "x":
                return chr(int(body[1:], 16))
            return chr(int(body))
        except ValueError:
            return "&#%s;" % body
    # Named entity reference, e.g. "amp" or "nbsp".
    try:
        return _HTML_UNICODE_MAP[body]
    except KeyError:
        return "&%s;" % body
def _build_unicode_map() -> Dict[str, str]:
    """Return a dict mapping HTML entity names (e.g. "amp") to characters."""
    return {
        name: chr(codepoint)
        for name, codepoint in html.entities.name2codepoint.items()
    }


# Built once at import time; used by _convert_entity for named entities.
_HTML_UNICODE_MAP = _build_unicode_map()