Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/asn1crypto/_iri.py: 11%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# coding: utf-8
3"""
4Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
5the following items:
7 - iri_to_uri()
8 - uri_to_iri()
9"""
11from __future__ import unicode_literals, division, absolute_import, print_function
13from encodings import idna # noqa
14import codecs
15import re
16import sys
18from ._errors import unwrap
19from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types
21if sys.version_info < (3,):
22 from urlparse import urlsplit, urlunsplit
23 from urllib import (
24 quote as urlquote,
25 unquote as unquote_to_bytes,
26 )
28else:
29 from urllib.parse import (
30 quote as urlquote,
31 unquote_to_bytes,
32 urlsplit,
33 urlunsplit,
34 )
def iri_to_uri(value, normalize=False):
    """
    Encodes a unicode IRI into an ASCII byte string URI

    :param value:
        A unicode string of an IRI

    :param normalize:
        A bool that controls URI normalization. When True, the default port
        of an http (80) or https (443) URI is omitted, and a path that is
        just "/" is dropped when there is no query and no fragment.

    :raises:
        TypeError - when value is not a unicode string

    :return:
        A byte string of the ASCII-encoded URI
    """

    if not isinstance(value, str_cls):
        raise TypeError(unwrap(
            '''
            value must be a unicode string, not %s
            ''',
            type_name(value)
        ))

    scheme = None
    # Python 2.6 doesn't split properly if the URL doesn't start with http:// or https://
    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
        real_prefix = None
        prefix_match = re.match('^[^:]*://', value)
        if prefix_match:
            # Temporarily swap the real scheme for http:// so urlsplit()
            # parses the remainder, then remember the real scheme
            real_prefix = prefix_match.group(0)
            value = 'http://' + value[len(real_prefix):]
        parsed = urlsplit(value)
        if real_prefix:
            value = real_prefix + value[7:]
            scheme = _urlquote(real_prefix[:-3])
    else:
        parsed = urlsplit(value)

    if scheme is None:
        scheme = _urlquote(parsed.scheme)
    hostname = parsed.hostname
    if hostname is not None:
        hostname = hostname.encode('idna')
        # urlsplit() strips the square brackets from an IPv6 literal. They
        # must be restored, otherwise the colons of the address can not be
        # told apart from the port delimiter in the rebuilt netloc. A host
        # can only contain ":" when it was a bracketed literal.
        if b':' in hostname:
            hostname = b'[' + hostname + b']'
    # RFC 3986 allows userinfo to contain sub-delims
    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
    port = parsed.port
    if port is not None:
        port = str_cls(port).encode('ascii')

    netloc = b''
    if username is not None:
        netloc += username
        if password:
            netloc += b':' + password
        netloc += b'@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        default_http = scheme == b'http' and port == b'80'
        default_https = scheme == b'https' and port == b'443'
        if not normalize or (not default_http and not default_https):
            netloc += b':' + port

    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

    if normalize and query is None and fragment is None and path == b'/':
        path = None

    # Python 2.7 compat
    if path is None:
        path = ''

    output = urlunsplit((scheme, netloc, path, query, fragment))
    if isinstance(output, str_cls):
        output = output.encode('latin1')
    return output
def uri_to_iri(value):
    """
    Converts an ASCII URI byte string into a unicode IRI

    :param value:
        An ASCII-encoded byte string of the URI

    :raises:
        TypeError - when value is not a byte string

    :return:
        A unicode string of the IRI
    """

    if not isinstance(value, byte_cls):
        raise TypeError(unwrap(
            '''
            value must be a byte string, not %s
            ''',
            type_name(value)
        ))

    parsed = urlsplit(value)

    scheme = parsed.scheme
    if scheme is not None:
        scheme = scheme.decode('ascii')

    # ":" and "@" delimit the userinfo, so encoded occurrences of them must
    # stay encoded
    username = _urlunquote(parsed.username, remap=[':', '@'])
    password = _urlunquote(parsed.password, remap=[':', '@'])
    hostname = parsed.hostname
    if hostname:
        hostname = hostname.decode('idna')
        # urlsplit() strips the square brackets from an IPv6 literal. They
        # must be restored, otherwise the colons of the address can not be
        # told apart from the port delimiter in the rebuilt netloc. A host
        # can only contain ":" when it was a bracketed literal.
        if ':' in hostname:
            hostname = '[' + hostname + ']'
    port = parsed.port
    if port and not isinstance(port, int_types):
        port = port.decode('ascii')

    netloc = ''
    if username is not None:
        netloc += username
        if password:
            netloc += ':' + password
        netloc += '@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        netloc += ':' + str_cls(port)

    # Literal delimiters of the path and query are kept, while encoded
    # versions of them remain encoded since they are functionally different
    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
    fragment = _urlunquote(parsed.fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))
def _iri_utf8_errors_handler(exc):
    """
    Codec error handler used when a URI part contains bytes that are not
    valid UTF-8. Each undecodable byte is emitted as a %XX escape inside the
    resulting unicode string, and decoding resumes after the bad range.

    :param exc:
        The UnicodeDecodeError exception

    :return:
        A 2-element tuple of (replacement unicode string, integer index to
        resume at)
    """

    undecodable = exc.object[exc.start:exc.end]
    escaped = ''
    for byte_value in bytes_to_list(undecodable):
        # Lower-case, two digit hex form of the byte
        escaped += '%%%02x' % byte_value
    return (escaped, exc.end)


# Make the handler available by name to bytes.decode(..., 'iriutf8')
codecs.register_error('iriutf8', _iri_utf8_errors_handler)
def _urlquote(string, safe=''):
    """
    Quotes a unicode string for use in a URL

    Existing %XX escapes are unquoted when they decode to valid UTF-8 and
    are then re-quoted as needed. Escapes that must stay encoded (invalid
    UTF-8 bytes, an encoded "%", and encoded versions of the safe
    characters) are kept verbatim and are never double quoted.

    :param string:
        A unicode string

    :param safe:
        A unicode string of characters to not encode

    :return:
        None (if string is None or empty) or an ASCII byte string of the
        quoted string
    """

    if string is None or string == '':
        return None

    safe_bytes = safe.encode('utf-8')

    # Try to unquote any percent values, restoring them if they are not
    # valid UTF-8. Also, requote any safe chars since encoded versions of
    # those are functionally different than the unquoted ones.
    def _try_unescape(match):
        byte_string = unquote_to_bytes(match.group(0))
        # An encoded percent sign has to stay encoded, otherwise it could
        # combine with following hex digits into a brand new escape. This
        # is done on the bytes so that escapes added by the decode error
        # handler are not affected.
        byte_string = byte_string.replace(b'%', b'%25')
        unicode_string = byte_string.decode('utf-8', 'iriutf8')
        for safe_char in safe:
            if safe_char == '%':
                # Already handled above
                continue
            unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
        return unicode_string
    string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)

    # Once we have the minimal set of hex quoted values, split the string
    # around them so only the text in between is quoted. Because of the
    # capture group, odd indexes hold the escapes themselves.
    pieces = []
    for index, piece in enumerate(re.split('(%[0-9a-fA-F]{2})', string)):
        if index % 2:
            pieces.append(piece.encode('ascii'))
        elif piece:
            quoted = urlquote(piece.encode('utf-8'), safe=safe_bytes)
            # Python 3 returns a unicode string, Python 2 a byte string
            if not isinstance(quoted, bytes):
                quoted = quoted.encode('ascii')
            pieces.append(quoted)

    return b''.join(pieces)
def _urlunquote(byte_string, remap=None, preserve=None):
    """
    Unquotes a URI portion from a byte string into unicode using UTF-8

    Bytes that are not valid UTF-8 are handled by the "iriutf8" codec error
    handler.

    :param byte_string:
        A byte string of the data to unquote

    :param remap:
        A list of characters (as unicode) that should be re-mapped to a
        %XX encoding. This is used when characters are not valid in part of a
        URL.

    :param preserve:
        A bool - indicates that the chars to be remapped if they occur in
        non-hex form, should be preserved. E.g. / for URL path.

    :return:
        None (if byte_string is None) or a unicode string
    """

    if byte_string is None:
        return byte_string

    if byte_string == b'':
        return ''

    # Pairs of (literal byte, lower-case %XX escape) for each remap char
    remap_pairs = [
        (char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))
        for char in (remap or [])
    ]

    def _unquote_segment(segment):
        # Any remap char present after unquoting came from a %XX escape,
        # so it is put back into encoded form
        segment = unquote_to_bytes(segment)
        for literal, escaped in remap_pairs:
            segment = segment.replace(literal, escaped)
        return segment

    if preserve and remap_pairs:
        # Split on the literal delimiters instead of swapping them for
        # placeholder bytes, so no byte value of the input can be mistaken
        # for a delimiter. Because of the capture group, odd indexes hold
        # the delimiters, which are kept untouched.
        delimiters = b'(' + b'|'.join(re.escape(literal) for literal, _ in remap_pairs) + b')'
        byte_string = b''.join(
            piece if index % 2 else _unquote_segment(piece)
            for index, piece in enumerate(re.split(delimiters, byte_string))
        )
    else:
        byte_string = _unquote_segment(byte_string)

    return byte_string.decode('utf-8', 'iriutf8')