Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/asn1crypto/_iri.py: 11%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

141 statements  

1# coding: utf-8 

2 

3""" 

4Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports 

5the following items: 

6 

7 - iri_to_uri() 

8 - uri_to_iri() 

9""" 

10 

11from __future__ import unicode_literals, division, absolute_import, print_function 

12 

13from encodings import idna # noqa 

14import codecs 

15import re 

16import sys 

17 

18from ._errors import unwrap 

19from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types 

20 

21if sys.version_info < (3,): 

22 from urlparse import urlsplit, urlunsplit 

23 from urllib import ( 

24 quote as urlquote, 

25 unquote as unquote_to_bytes, 

26 ) 

27 

28else: 

29 from urllib.parse import ( 

30 quote as urlquote, 

31 unquote_to_bytes, 

32 urlsplit, 

33 urlunsplit, 

34 ) 

35 

36 

def iri_to_uri(value, normalize=False):
    """
    Encodes a unicode IRI into an ASCII byte string URI

    :param value:
        A unicode string of an IRI

    :param normalize:
        A bool that controls URI normalization. When True, the port is
        omitted if it is 80 for http or 443 for https, and a path of just
        "/" is dropped when there is no query and no fragment.

    :raises:
        TypeError - when value is not a unicode string

    :return:
        A byte string of the ASCII-encoded URI
    """

    if not isinstance(value, str_cls):
        raise TypeError(unwrap(
            '''
            value must be a unicode string, not %s
            ''',
            type_name(value)
        ))

    scheme = None
    # Python 2.6 doesn't split properly if the URL doesn't start with http:// or https://
    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
        real_prefix = None
        prefix_match = re.match('^[^:]*://', value)
        if prefix_match:
            # Temporarily swap the real scheme for http:// so that urlsplit()
            # breaks the value into its components
            real_prefix = prefix_match.group(0)
            value = 'http://' + value[len(real_prefix):]
        parsed = urlsplit(value)
        if real_prefix:
            # Put the real scheme back, 7 being the length of "http://"
            value = real_prefix + value[7:]
            scheme = _urlquote(real_prefix[:-3])
    else:
        parsed = urlsplit(value)

    if scheme is None:
        scheme = _urlquote(parsed.scheme)
    hostname = parsed.hostname
    if hostname is not None:
        hostname = hostname.encode('idna')
        # The hostname attribute has the square brackets of an IP-literal
        # (e.g. an IPv6 address) removed. RFC 3986 requires them, otherwise
        # the colons in the address are ambiguous with the port separator.
        if b':' in hostname:
            hostname = b'[' + hostname + b']'
    # RFC 3986 allows userinfo to contain sub-delims
    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
    port = parsed.port
    if port is not None:
        port = str_cls(port).encode('ascii')

    netloc = b''
    if username is not None:
        netloc += username
        if password:
            netloc += b':' + password
        netloc += b'@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        default_http = scheme == b'http' and port == b'80'
        default_https = scheme == b'https' and port == b'443'
        # The port is only dropped when normalizing and it is the default
        # for the scheme
        if not normalize or (not default_http and not default_https):
            netloc += b':' + port

    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

    if normalize and query is None and fragment is None and path == b'/':
        path = None

    # Python 2.7 compat
    if path is None:
        path = ''

    output = urlunsplit((scheme, netloc, path, query, fragment))
    # urlunsplit() may hand back a unicode string, but a byte string is
    # always returned to the caller
    if isinstance(output, str_cls):
        output = output.encode('latin1')
    return output

118 

119 

def uri_to_iri(value):
    """
    Converts an ASCII URI byte string into a unicode IRI

    :param value:
        An ASCII-encoded byte string of the URI

    :raises:
        TypeError - when value is not a byte string

    :return:
        A unicode string of the IRI
    """

    if not isinstance(value, byte_cls):
        raise TypeError(unwrap(
            '''
            value must be a byte string, not %s
            ''',
            type_name(value)
        ))

    parsed = urlsplit(value)

    scheme = parsed.scheme
    if scheme is not None:
        scheme = scheme.decode('ascii')

    # A decoded ":" or "@" inside the userinfo is kept in %XX form since the
    # literal characters delimit the parts of the netloc
    username = _urlunquote(parsed.username, remap=[':', '@'])
    password = _urlunquote(parsed.password, remap=[':', '@'])
    hostname = parsed.hostname
    if hostname:
        hostname = hostname.decode('idna')
        # The hostname attribute has the square brackets of an IP-literal
        # (e.g. an IPv6 address) removed. They must be restored, otherwise
        # the colons in the address are ambiguous with the port separator.
        if ':' in hostname:
            hostname = '[' + hostname + ']'
    port = parsed.port
    if port and not isinstance(port, int_types):
        port = port.decode('ascii')

    netloc = ''
    if username is not None:
        netloc += username
        if password:
            netloc += ':' + password
        netloc += '@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        netloc += ':' + str_cls(port)

    # Literal delimiters are preserved in the path and query, while any that
    # were percent-encoded remain encoded
    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
    fragment = _urlunquote(parsed.fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))

170 

171 

def _iri_utf8_errors_handler(exc):
    """
    Codec error handler used when decoding the UTF-8 portions of a URI into
    an IRI. Bytes that can not be decoded are emitted in %XX form (lowercase
    hex) as part of the resulting unicode string.

    :param exc:
        The UnicodeDecodeError exception

    :return:
        A 2-element tuple of (replacement unicode string, integer index to
        resume at)
    """

    undecodable = exc.object[exc.start:exc.end]
    escaped = ''.join('%%%02x' % value for value in bytes_to_list(undecodable))
    # Decoding resumes right after the bytes that were just escaped
    return (escaped, exc.end)


# Makes the handler available as the "iriutf8" errors argument to decode()
codecs.register_error('iriutf8', _iri_utf8_errors_handler)

191 

192 

def _urlquote(string, safe=''):
    """
    Quotes a unicode string for use in a URL

    Existing %XX escapes are unquoted when they form valid UTF-8 and do not
    represent one of the safe characters. All other existing escapes are
    passed through untouched so they are not double quoted.

    :param string:
        A unicode string

    :param safe:
        A unicode string of character to not encode

    :return:
        None (if string is None or empty) or an ASCII byte string of the
        quoted string
    """

    if string is None or string == '':
        return None

    # Anything already hex quoted is pulled out of the URL and unquoted if
    # possible
    if re.search('%[0-9a-fA-F]{2}', string):
        # Try to unquote any percent values, restoring them if they are not
        # valid UTF-8. Also, requote any safe chars since encoded versions of
        # those are functionally different than the unquoted ones.
        def _try_unescape(match):
            byte_string = unquote_to_bytes(match.group(0))
            unicode_string = byte_string.decode('utf-8', 'iriutf8')
            for safe_char in list(safe):
                unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
            return unicode_string
        string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)

    safe_bytes = safe.encode('utf-8')

    # Once we have the minimal set of hex quoted values, split the string on
    # them so only the literal text in between is quoted. Because the pattern
    # is a capturing group, the even indexes of the result are literal text
    # and the odd indexes are the escapes. Splitting, rather than swapping
    # each escape for a placeholder character, means a NUL in the string can
    # never be confused with an escape.
    pieces = []
    for index, piece in enumerate(re.split('(%[0-9a-fA-F]{2})', string)):
        if index % 2:
            # An existing escape, passed through as-is
            pieces.append(piece.encode('ascii'))
            continue
        quoted = urlquote(piece.encode('utf-8'), safe=safe_bytes)
        if not isinstance(quoted, byte_cls):
            quoted = quoted.encode('ascii')
        pieces.append(quoted)

    return b''.join(pieces)

243 

244 

245def _urlunquote(byte_string, remap=None, preserve=None): 

246 """ 

247 Unquotes a URI portion from a byte string into unicode using UTF-8 

248 

249 :param byte_string: 

250 A byte string of the data to unquote 

251 

252 :param remap: 

253 A list of characters (as unicode) that should be re-mapped to a 

254 %XX encoding. This is used when characters are not valid in part of a 

255 URL. 

256 

257 :param preserve: 

258 A bool - indicates that the chars to be remapped if they occur in 

259 non-hex form, should be preserved. E.g. / for URL path. 

260 

261 :return: 

262 A unicode string 

263 """ 

264 

265 if byte_string is None: 

266 return byte_string 

267 

268 if byte_string == b'': 

269 return '' 

270 

271 if preserve: 

272 replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F'] 

273 preserve_unmap = {} 

274 for char in remap: 

275 replacement = replacements.pop(0) 

276 preserve_unmap[replacement] = char 

277 byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii')) 

278 

279 byte_string = unquote_to_bytes(byte_string) 

280 

281 if remap: 

282 for char in remap: 

283 byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii')) 

284 

285 output = byte_string.decode('utf-8', 'iriutf8') 

286 

287 if preserve: 

288 for replacement, original in preserve_unmap.items(): 

289 output = output.replace(replacement, original) 

290 

291 return output