Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/django/utils/encoding.py: 34%
97 statements
« prev ^ index » next coverage.py v7.0.5, created at 2023-01-17 06:13 +0000
« prev ^ index » next coverage.py v7.0.5, created at 2023-01-17 06:13 +0000
1import codecs
2import datetime
3import locale
4from decimal import Decimal
5from urllib.parse import quote
7from django.utils.functional import Promise
class DjangoUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError that also remembers the object being converted.

    The first positional argument is stored on ``self.obj``; all remaining
    positional arguments are forwarded unchanged to ``UnicodeDecodeError``.
    """

    def __init__(self, obj, *args):
        # Keep a handle on the offending object so __str__ can report it.
        self.obj = obj
        super().__init__(*args)

    def __str__(self):
        """Return the base message followed by the repr and type of ``obj``."""
        base_message = super().__str__()
        culprit = self.obj
        return "%s. You passed in %r (%s)" % (base_message, culprit, type(culprit))
def smart_str(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Return a str version of 's', decoding bytestrings with 'encoding'.

    Promise instances are returned untouched; every other value is handed to
    force_str() together with 'encoding', 'strings_only' and 'errors'.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    if not isinstance(s, Promise):
        return force_str(s, encoding, strings_only, errors)
    # The input is the result of a gettext_lazy() call: keep it lazy.
    return s
# Types whose instances force_str(strings_only=True) returns unchanged.
_PROTECTED_TYPES = (
    type(None),
    int,
    float,
    Decimal,
    datetime.datetime,
    datetime.date,
    datetime.time,
)


def is_protected_type(obj):
    """Return True if 'obj' is an instance of one of the protected types.

    Objects of protected types are preserved as-is when passed to
    force_str(strings_only=True).
    """
    protected = _PROTECTED_TYPES
    return isinstance(obj, protected)
def force_str(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Return 's' as a str, resolving lazy instances instead of keeping them
    lazy (unlike smart_str()).

    bytes are decoded with 'encoding' and 'errors'; anything else goes through
    str(). A UnicodeDecodeError raised during the conversion is re-raised as
    DjangoUnicodeDecodeError carrying 's'.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Cheapest check first: str (and subclasses) are by far the common case.
    if issubclass(type(s), str):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        text = str(s, encoding, errors) if isinstance(s, bytes) else str(s)
    except UnicodeDecodeError as e:
        raise DjangoUnicodeDecodeError(s, *e.args)
    return text
def smart_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Return a bytestring version of 's', encoded as specified in 'encoding'.

    Promise instances are returned untouched; every other value is handed to
    force_bytes() together with 'encoding', 'strings_only' and 'errors'.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    if not isinstance(s, Promise):
        return force_bytes(s, encoding, strings_only, errors)
    # The input is the result of a gettext_lazy() call: keep it lazy.
    return s
def force_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Return 's' as bytes, resolving lazy instances instead of keeping them
    lazy (unlike smart_bytes()).

    bytes input is treated as UTF-8: it is returned as-is when 'encoding' is
    "utf-8", otherwise it is decoded as UTF-8 and re-encoded to 'encoding'.
    memoryview input is copied into bytes. Anything else is converted with
    str() and then encoded.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Handle the common case first for performance reasons.
    if isinstance(s, bytes):
        if encoding != "utf-8":
            return s.decode("utf-8", errors).encode(encoding, errors)
        return s
    if strings_only and is_protected_type(s):
        return s
    if isinstance(s, memoryview):
        return bytes(s)
    as_text = str(s)
    return as_text.encode(encoding, errors)
def iri_to_uri(iri):
    """
    Convert an Internationalized Resource Identifier (IRI) portion to a URI
    portion that is suitable for inclusion in a URL.

    This is the algorithm from RFC 3987 Section 3.1, slightly simplified since
    the input is assumed to be a string rather than an arbitrary byte stream.

    Take an IRI (string or UTF-8 bytes, e.g. '/I ♥ Django/' or
    b'/I \xe2\x99\xa5 Django/') and return a string containing the encoded
    result with ASCII chars only (e.g. '/I%20%E2%99%A5%20Django/').
    None is returned unchanged.
    """
    if iri is None:
        return iri
    if isinstance(iri, Promise):
        iri = str(iri)
    # Safe characters = RFC 3986 "reserved" (Section 2.2) and "unreserved"
    # (Section 2.3) sets:
    #     gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    #     sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
    #                  / "*" / "+" / "," / ";" / "="
    #     unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
    # urllib.parse.quote() already treats every unreserved character except
    # "~" as safe, so only "~" has to be listed explicitly. "%" is listed as
    # well because the end of RFC 3987 Section 3.1 says it must not be
    # converted.
    safe_chars = "/#%[]=:;$&()+,!?*@'~"
    return quote(iri, safe=safe_chars)
# Lookup table used by uri_to_iri(): maps the two hex digits of a percent
# escape (as bytes, e.g. b"7E") to the single byte it is decoded to. Escapes
# that are not keys of this table are left percent-encoded.
#
# First the code points listed below: "-", ".", "_", "~" and the ASCII
# letters A-Z / a-z, in both lowercase and uppercase hex spelling.
# NOTE(review): the original comment calls these "the unreserved characters
# from RFC 3986", but the digits 0-9 are not included here - confirm whether
# that is intentional.
_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
_hextobyte = {
    hex_text.encode(): bytes([code])
    for codes in _ascii_ranges
    for code in codes
    for hex_text in (format(code, "02x"), format(code, "02X"))
}
# Then every byte value >= 128 (first hex digit 8-F, any case mix), because
# those bytes are part of multibyte Unicode characters.
_hexdig = "0123456789ABCDEFabcdef"
_hextobyte.update(
    (pair.encode(), bytes.fromhex(pair))
    for pair in (high + low for high in _hexdig[8:] for low in _hexdig)
)
def uri_to_iri(uri):
    """
    Convert a Uniform Resource Identifier (URI) into an Internationalized
    Resource Identifier (IRI).

    This is the algorithm from RFC 3987 Section 3.2, excluding step 4.

    Take a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and return
    a string containing the encoded result (e.g. '/I%20♥%20Django/').
    None is returned unchanged.
    """
    if uri is None:
        return uri
    raw = force_bytes(uri)
    # Selective unquote: split on "%"; every chunk after the first starts
    # with the two characters that followed a "%". When those two characters
    # are a key of _hextobyte they are replaced by the decoded byte, otherwise
    # the "%" is put back and the chunk is kept verbatim. Whatever follows the
    # two hex digits contains no "%" and is copied through untouched.
    head, *tail = raw.split(b"%")
    pieces = [head]
    for chunk in tail:
        octet = _hextobyte.get(chunk[:2])
        if octet is None:
            pieces.append(b"%")
            pieces.append(chunk)
        else:
            pieces.append(octet)
            pieces.append(chunk[2:])
    return repercent_broken_unicode(b"".join(pieces)).decode()
def escape_uri_path(path):
    """
    Escape the unsafe characters from the path portion of a Uniform Resource
    Identifier (URI).
    """
    # The safe set starts from the "reserved" and "unreserved" characters
    # (the original comment cites RFC 3986 Sections 2.2 and 2.3):
    #     reserved   = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
    #     unreserved = alphanum | mark
    #     mark       = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    # and removes ";", "=" and "?" from it, according to RFC 3986 Section 3.3.
    # "/" stays safe because the whole path is escaped here, not a single
    # path segment.
    path_safe_chars = "/:@&+$,-_.!~*'()"
    return quote(path, safe=path_safe_chars)
def punycode(domain):
    """Return 'domain' encoded with the "idna" codec, as an ASCII str."""
    idna_bytes = domain.encode("idna")
    return idna_bytes.decode("ascii")
def repercent_broken_unicode(path):
    """
    As per RFC 3987 Section 3.2, step three of converting a URI into an IRI,
    repercent-encode any octet produced that is not part of a strictly legal
    UTF-8 octet sequence.

    'path' is a bytes object. The return value is bytes that decode cleanly
    as UTF-8: every span the decoder rejects is replaced by its
    percent-encoded form, all other bytes are passed through unchanged.
    """
    # Finished pieces (valid prefix + percent-encoded bad span) are collected
    # here and only the remaining suffix is decoded again. Rebuilding and
    # re-decoding the whole buffer after each repair would cost quadratic
    # time when the input contains many invalid octets.
    fixed_parts = []
    while True:
        try:
            path.decode()
        except UnicodeDecodeError as e:
            # CVE-2019-14235: A recursion shouldn't be used since the exception
            # handling uses massive amounts of memory
            repercent = quote(path[e.start : e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
            # Everything before e.start decoded fine, so it is final.
            fixed_parts.append(path[: e.start] + repercent.encode())
            path = path[e.end :]
        else:
            return b"".join(fixed_parts) + path
def filepath_to_uri(path):
    """Convert a file system path to a URI portion that is suitable for
    inclusion in a URL.

    Backslashes are turned into forward slashes, then characters that would
    normally be recognized as special chars for URIs are percent-encoded.
    The ' character is not encoded, as it is a valid character within URIs.
    See the encodeURIComponent() JavaScript function for details.
    None is returned unchanged.
    """
    if path is None:
        return path
    # The separators are hardcoded on purpose rather than taken from
    # `os.sep` / `os.altsep`, to keep some flexibility.
    forward_slashed = str(path).replace("\\", "/")
    return quote(forward_slashed, safe="/~!*()'")
def get_system_encoding():
    """
    The encoding for the character type functions. Fallback to 'ascii' if the
    encoding is unsupported by Python or could not be determined. See tickets
    #10335 and #5846.
    """
    fallback = "ascii"
    try:
        name = locale.getlocale()[1] or fallback
        # Raises if Python has no codec registered under this name.
        codecs.lookup(name)
    except Exception:
        # Deliberately broad: any failure to determine or validate the
        # encoding falls back to ASCII.
        return fallback
    return name


# Computed once, when the module is imported.
DEFAULT_LOCALE_ENCODING = get_system_encoding()