Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/urllib3/util/url.py: 29%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

203 statements  

1from __future__ import annotations 

2 

3import re 

4import typing 

5 

6from ..exceptions import LocationParseError 

7from .util import to_str 

8 

# Normalization is only applied to URLs with an HTTP(S) scheme.
# urllib3 infers URLs without a scheme (None) to be http.
_NORMALIZABLE_SCHEMES = ("http", "https", None)

# Almost all of these patterns were derived from the
# 'rfc3986' module: https://github.com/python-hyper/rfc3986

# A single percent-encoded octet such as "%2F" or "%2f".
_PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")

# Input that begins with "scheme:" or with "/". parse_url() prepends "//" to
# anything this does not match. Unlike the scheme group of _URI_RE, "." is
# not accepted in the scheme here.
_SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")

# Splits a URI reference into five groups:
# (scheme, authority, path, query, fragment).
_URI_RE = re.compile(
    "".join(
        (
            r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?",  # scheme
            r"(?://([^\\/?#]*))?",  # authority
            r"([^?#]*)",  # path
            r"(?:\?([^#]*))?",  # query
            r"(?:#(.*))?$",  # fragment
        )
    ),
    re.DOTALL | re.UNICODE,
)

25 

# Textual building blocks for IP literals (dotted-quad IPv4, one 1-4 digit
# hex group, and the trailing 32 bits of an IPv6 address).
_IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
_HEX_PAT = "[0-9A-Fa-f]{1,4}"
_LS32_PAT = f"(?:{_HEX_PAT}:{_HEX_PAT}|{_IPV4_PAT})"
_subs = {"hex": _HEX_PAT, "ls32": _LS32_PAT}
# One %-template per IPv6 address form; the comment above each entry is the
# grammar production it implements.
_variations = [
    #                            6( h16 ":" ) ls32
    "(?:%(hex)s:){6}%(ls32)s",
    #                       "::" 5( h16 ":" ) ls32
    "::(?:%(hex)s:){5}%(ls32)s",
    # [               h16 ] "::" 4( h16 ":" ) ls32
    "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
    "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
    # [ *4( h16 ":" ) h16 ] "::"              ls32
    "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
    # [ *5( h16 ":" ) h16 ] "::"              h16
    "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
    # [ *6( h16 ":" ) h16 ] "::"
    "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
]

_UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~"
# Alternation of every address form above.
_IPV6_PAT = "(?:" + "|".join(template % _subs for template in _variations) + ")"
# "%25" or a bare "%", followed by one or more unreserved characters or
# percent-encoded octets.
_ZONE_ID_PAT = f"(?:%25|%)(?:[{_UNRESERVED_PAT}]|%[a-fA-F0-9]{{2}})+"
# Bracketed IPv6 literal with an optional zone id.
_IPV6_ADDRZ_PAT = rf"\[{_IPV6_PAT}(?:{_ZONE_ID_PAT})?\]"
_REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
# Request target: group 1 is the path (must start with "/"), group 2 the
# optional query; a fragment is matched but not captured.
_TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")

_IPV4_RE = re.compile(f"^{_IPV4_PAT}$")
_IPV6_RE = re.compile(f"^{_IPV6_PAT}$")
_IPV6_ADDRZ_RE = re.compile(f"^{_IPV6_ADDRZ_PAT}$")
# Same as _IPV6_ADDRZ_RE but without the surrounding "\[" and "\]".
_BRACELESS_IPV6_ADDRZ_RE = re.compile(f"^{_IPV6_PAT}(?:{_ZONE_ID_PAT})?$")
# Captures the zone id sitting directly before the closing "]".
_ZONE_ID_RE = re.compile(rf"({_ZONE_ID_PAT})\]$")

# Group 1 is the host (reg-name, IPv4 or bracketed IPv6); group 2 is the port
# with leading zeros skipped, "" when only ":" is present, or None when there
# is no ":port" part at all.
_HOST_PORT_PAT = (
    f"^({_REG_NAME_PAT}|{_IPV4_PAT}|{_IPV6_ADDRZ_PAT})"
    "(?::0*?(|0|[1-9][0-9]{0,4}))?$"
)
_HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.DOTALL | re.UNICODE)

# Characters that may appear unescaped in each URL component.
_UNRESERVED_CHARS = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
)
_SUB_DELIM_CHARS = set("!$&'()*+,;=")
_USERINFO_CHARS = _UNRESERVED_CHARS | _SUB_DELIM_CHARS | {":"}
_PATH_CHARS = _USERINFO_CHARS | {"@", "/"}
# Query and fragment share one and the same set object.
_QUERY_CHARS = _PATH_CHARS | {"?"}
_FRAGMENT_CHARS = _QUERY_CHARS

78 

79 

class Url(
    typing.NamedTuple(
        "Url",
        [
            ("scheme", typing.Optional[str]),
            ("auth", typing.Optional[str]),
            ("host", typing.Optional[str]),
            ("port", typing.Optional[int]),
            ("path", typing.Optional[str]),
            ("query", typing.Optional[str]),
            ("fragment", typing.Optional[str]),
        ],
    )
):
    """
    Data structure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`.

    The constructor lower-cases the scheme and makes a non-empty path start
    with ``/``. The host is stored as given; :func:`parse_url` normalizes it
    before building the tuple.
    """

    def __new__(  # type: ignore[no-untyped-def]
        cls,
        scheme: str | None = None,
        auth: str | None = None,
        host: str | None = None,
        port: int | None = None,
        path: str | None = None,
        query: str | None = None,
        fragment: str | None = None,
    ):
        # A non-empty path is always stored in absolute form.
        if path and not path.startswith("/"):
            path = "/" + path
        if scheme is not None:
            scheme = scheme.lower()
        return super().__new__(cls, scheme, auth, host, port, path, query, fragment)

    @property
    def hostname(self) -> str | None:
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self) -> str:
        """Absolute path including the query string.

        Falls back to ``/`` when the path is empty or ``None``. A query that
        is the empty string still contributes a trailing ``?``.
        """
        uri = self.path or "/"

        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def authority(self) -> str | None:
        """
        Authority component as defined in RFC 3986 3.2.
        This includes userinfo (auth), host and port.

        i.e.
            userinfo@host:port

        Returns ``None`` when there is no host.
        """
        userinfo = self.auth
        netloc = self.netloc
        if netloc is None or userinfo is None:
            return netloc
        else:
            return f"{userinfo}@{netloc}"

    @property
    def netloc(self) -> str | None:
        """
        Network location including host and port.

        Returns ``None`` when there is no host. The port is appended whenever
        it is set, including port ``0``, which matches what :attr:`url`
        emits.

        If you need the equivalent of urllib.parse's ``netloc``,
        use the ``authority`` property instead.
        """
        if self.host is None:
            return None
        # Compare against None rather than testing truthiness so that an
        # explicit port 0 is kept instead of being silently dropped.
        if self.port is not None:
            return f"{self.host}:{self.port}"
        return self.host

    @property
    def url(self) -> str:
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example:

        .. code-block:: python

            import urllib3

            U = urllib3.util.parse_url("https://google.com/mail/")

            print(U.url)
            # "https://google.com/mail/"

            print( urllib3.util.Url("https", "username:password",
                                    "host.com", 80, "/path", "query", "fragment"
                                    ).url
                )
            # "https://username:password@host.com:80/path?query#fragment"
        """
        scheme, auth, host, port, path, query, fragment = self
        url = ""

        # We use "is not None" because we want empty strings (and a port of
        # 0) to be emitted too.
        if scheme is not None:
            url += scheme + "://"
        if auth is not None:
            url += auth + "@"
        if host is not None:
            url += host
        if port is not None:
            url += ":" + str(port)
        if path is not None:
            url += path
        if query is not None:
            url += "?" + query
        if fragment is not None:
            url += "#" + fragment

        return url

    def __str__(self) -> str:
        return self.url

211 

212 

@typing.overload
def _encode_invalid_chars(
    component: str, allowed_chars: typing.Container[str]
) -> str:  # Abstract
    ...


@typing.overload
def _encode_invalid_chars(
    component: None, allowed_chars: typing.Container[str]
) -> None:  # Abstract
    ...


def _encode_invalid_chars(
    component: str | None, allowed_chars: typing.Container[str]
) -> str | None:
    """Percent-encode the bytes of *component* that are not allowed.

    Existing ``%XX`` escapes are upper-cased first. The component is then
    walked byte by byte (UTF-8): an ASCII byte whose character is in
    *allowed_chars* is copied through, everything else becomes ``%XX``.

    ``%`` itself is copied through only when every ``%`` in the component
    starts a well-formed escape, so an already-encoded component is not
    encoded a second time. ``None`` is returned unchanged.
    """
    if component is None:
        return component

    component = to_str(component)

    # Upper-case the hex digits of every escape and count how many escapes
    # were found.
    component, escape_count = _PERCENT_RE.subn(
        lambda found: found.group(0).upper(), component
    )

    raw = component.encode("utf-8", "surrogatepass")
    # True when no stray "%" exists, i.e. each "%" belongs to an escape.
    already_encoded = raw.count(b"%") == escape_count
    result = bytearray()

    for code in raw:
        if already_encoded and code == 0x25:  # "%"
            result.append(code)
        elif code < 128 and chr(code) in allowed_chars:
            result.append(code)
        else:
            result += f"%{code:02X}".encode()

    return result.decode()

261 

262 

def _remove_path_dot_segments(path: str) -> str:
    """Resolve the ``.`` and ``..`` segments of *path*.

    ``.`` segments are dropped and each ``..`` removes the segment collected
    before it, if there is one. A leading ``/`` is kept, and a path that ends
    in ``/.`` or ``/..`` ends in ``/`` afterwards.
    """
    # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
    kept: list[str] = []

    for part in path.split("/"):
        if part == "..":
            # Step back one segment when there is something to step over
            if kept:
                kept.pop()
        elif part != ".":
            # "." means "here" and adds nothing; any other segment is kept
            kept.append(part)

    # An absolute path must still begin with "/" once joined: put the empty
    # leading segment back if ".." handling consumed it.
    if path.startswith("/") and (not kept or kept[0] != ""):
        kept.insert(0, "")

    # A path that ends with '/.' or '/..' names a directory, so finish with
    # an empty segment to get the trailing '/'.
    if path.endswith(("/.", "/..")):
        kept.append("")

    return "/".join(kept)

291 

292 

@typing.overload
def _normalize_host(host: None, scheme: str | None) -> None: ...


@typing.overload
def _normalize_host(host: str, scheme: str | None) -> str: ...


def _normalize_host(host: str | None, scheme: str | None) -> str | None:
    """Normalize *host* when *scheme* is one of ``_NORMALIZABLE_SCHEMES``.

    * Bracketed IPv6 literals are lower-cased; a zone id, if present, is
      re-attached after a single ``%`` with its invalid characters
      percent-encoded.
    * IPv4 literals are returned unchanged.
    * Anything else is encoded label by label with ``_idna_encode``.

    An empty or ``None`` host, or a host under any other scheme, is returned
    as is.
    """
    if not host or scheme not in _NORMALIZABLE_SCHEMES:
        return host

    if _IPV6_ADDRZ_RE.match(host):
        # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as
        # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID
        # separator as necessary to return a valid RFC 4007 scoped IP.
        zone_match = _ZONE_ID_RE.search(host)
        if zone_match is None:
            return host.lower()

        start, end = zone_match.span(1)
        zone_id = host[start:end]
        # Strip the "%25" separator, or a bare "%" (which is also what a
        # zone id of exactly "%25" is treated as).
        separator_len = 3 if zone_id.startswith("%25") and zone_id != "%25" else 1
        zone_id = _encode_invalid_chars(zone_id[separator_len:], _UNRESERVED_CHARS)
        return f"{host[:start].lower()}%{zone_id}{host[end:]}"

    if _IPV4_RE.match(host):
        return host

    encoded_labels = [_idna_encode(label) for label in host.split(".")]
    return to_str(b".".join(encoded_labels), "ascii")

328 

329 

def _idna_encode(name: str) -> bytes:
    """Return the lower-cased label *name* as ASCII bytes.

    Pure-ASCII labels are simply lower-cased and encoded. Any other label is
    encoded with the optional ``idna`` package.

    :raises LocationParseError: if ``idna`` is needed but cannot be imported,
        or if it rejects the label.
    """
    if name.isascii():
        return name.lower().encode("ascii")

    # Imported lazily: only non-ASCII labels need the package.
    try:
        import idna
    except ImportError:
        raise LocationParseError(
            "Unable to parse URL without the 'idna' module"
        ) from None

    try:
        return idna.encode(name.lower(), strict=True, std3_rules=True)
    except idna.IDNAError:
        raise LocationParseError(
            f"Name '{name}' is not a valid IDNA label"
        ) from None

347 

348 

def _encode_target(target: str) -> str:
    """Percent-encode the invalid characters of a request target.

    *target* must start with '/'; when it does, _TARGET_RE always matches.
    The path and the optional query are encoded separately and re-joined
    with '?'. A fragment, if present, is not part of the result.

    :raises LocationParseError: if *target* does not match _TARGET_RE.
    """
    parsed = _TARGET_RE.match(target)
    if parsed is None:  # Defensive:
        raise LocationParseError(f"{target!r} is not a valid request URI")

    path, query = parsed.groups()
    encoded_path = _encode_invalid_chars(path, _PATH_CHARS)
    if query is None:
        return encoded_path
    return encoded_path + "?" + _encode_invalid_chars(query, _QUERY_CHARS)

365 

366 

def parse_url(url: str) -> Url:
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 and RFC 6874 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.
    :raises LocationParseError: if the url cannot be parsed or the port is
        outside ``0..65535``.

    Partly backwards-compatible with :mod:`urllib.parse`.

    Example:

    .. code-block:: python

        import urllib3

        print( urllib3.util.parse_url('http://google.com/mail/'))
        # Url(scheme='http', host='google.com', port=None, path='/mail/', ...)

        print( urllib3.util.parse_url('google.com:80'))
        # Url(scheme=None, host='google.com', port=80, path=None, ...)

        print( urllib3.util.parse_url('/foo?bar'))
        # Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Empty
        return Url()

    # Keep the caller's input for error reporting; ``url`` itself may get a
    # "//" prefix so that scheme-less input is read as an authority.
    source_url = url
    if _SCHEME_RE.search(url) is None:
        url = "//" + url

    scheme: str | None
    authority: str | None
    auth: str | None
    host: str | None
    port: str | None
    port_int: int | None
    path: str | None
    query: str | None
    fragment: str | None

    try:
        scheme, authority, path, query, fragment = _URI_RE.match(url).groups()  # type: ignore[union-attr]
        normalize_uri = scheme is None or scheme.lower() in _NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        if not authority:
            auth = host = port = None
        else:
            # Split on the last "@" so that userinfo may itself contain "@".
            auth, _, host_port = authority.rpartition("@")
            auth = auth or None
            host, port = _HOST_PORT_RE.match(host_port).groups()  # type: ignore[union-attr]
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, _USERINFO_CHARS)
            # "host:" with nothing after the colon means "no port".
            if port == "":
                port = None

        if port is None:
            port_int = None
        else:
            port_int = int(port)
            if not (0 <= port_int <= 65535):
                raise LocationParseError(url)

        host = _normalize_host(host, scheme)

        if normalize_uri:
            if path:
                path = _remove_path_dot_segments(path)
                path = _encode_invalid_chars(path, _PATH_CHARS)
            if query:
                query = _encode_invalid_chars(query, _QUERY_CHARS)
            if fragment:
                fragment = _encode_invalid_chars(fragment, _FRAGMENT_CHARS)

    except (ValueError, AttributeError) as e:
        raise LocationParseError(source_url) from e

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        has_trailing_parts = query is not None or fragment is not None
        path = "" if has_trailing_parts else None

    return Url(
        scheme=scheme,
        auth=auth,
        host=host,
        port=port_int,
        path=path,
        query=query,
        fragment=fragment,
    )

469 )