Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/urllib3/util/url.py: 92%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

203 statements  

1from __future__ import annotations 

2 

3import re 

4import typing 

5 

6from ..exceptions import LocationParseError 

7from .util import to_str 

8 

9# We only want to normalize urls with an HTTP(S) scheme. 

10# urllib3 infers URLs without a scheme (None) to be http. 

11_NORMALIZABLE_SCHEMES = ("http", "https", None) 

12 

13# Almost all of these patterns were derived from the 

14# 'rfc3986' module: https://github.com/python-hyper/rfc3986 

15_PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}") 

16_SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)") 

17_URI_RE = re.compile( 

18 r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?" 

19 r"(?://([^\\/?#]*))?" 

20 r"([^?#]*)" 

21 r"(?:\?([^#]*))?" 

22 r"(?:#(.*))?$", 

23 re.UNICODE | re.DOTALL, 

24) 

25 

26_IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}" 

27_HEX_PAT = "[0-9A-Fa-f]{1,4}" 

28_LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=_HEX_PAT, ipv4=_IPV4_PAT) 

29_subs = {"hex": _HEX_PAT, "ls32": _LS32_PAT} 

30_variations = [ 

31 # 6( h16 ":" ) ls32 

32 "(?:%(hex)s:){6}%(ls32)s", 

33 # "::" 5( h16 ":" ) ls32 

34 "::(?:%(hex)s:){5}%(ls32)s", 

35 # [ h16 ] "::" 4( h16 ":" ) ls32 

36 "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s", 

37 # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 

38 "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s", 

39 # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 

40 "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s", 

41 # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 

42 "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s", 

43 # [ *4( h16 ":" ) h16 ] "::" ls32 

44 "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s", 

45 # [ *5( h16 ":" ) h16 ] "::" h16 

46 "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s", 

47 # [ *6( h16 ":" ) h16 ] "::" 

48 "(?:(?:%(hex)s:){0,6}%(hex)s)?::", 

49] 

50 

51_UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~" 

52_IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")" 

53_ZONE_ID_PAT = "(?:%25|%)(?:[" + _UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+" 

54_IPV6_ADDRZ_PAT = r"\[" + _IPV6_PAT + r"(?:" + _ZONE_ID_PAT + r")?\]" 

55_REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*" 

56_TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$") 

57 

58_IPV4_RE = re.compile( 

59 r"^(?:0[xX][0-9a-fA-F]+|[0-9]+)(?:\.(?:0[xX][0-9a-fA-F]+|[0-9]+)){0,3}$" 

60) 

61_IPV6_RE = re.compile("^" + _IPV6_PAT + "$") 

62_IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT + "$") 

63_BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT[2:-2] + "$") 

64_ZONE_ID_RE = re.compile("(" + _ZONE_ID_PAT + r")\]$") 

65 

66_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % ( 

67 _REG_NAME_PAT, 

68 _IPV4_PAT, 

69 _IPV6_ADDRZ_PAT, 

70) 

71_HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.UNICODE | re.DOTALL) 

72 

73_UNRESERVED_CHARS = set( 

74 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~" 

75) 

76_SUB_DELIM_CHARS = set("!$&'()*+,;=") 

77_USERINFO_CHARS = _UNRESERVED_CHARS | _SUB_DELIM_CHARS | {":"} 

78_PATH_CHARS = _USERINFO_CHARS | {"@", "/"} 

79_QUERY_CHARS = _FRAGMENT_CHARS = _PATH_CHARS | {"?"} 

80 

81 

82class Url( 

83 typing.NamedTuple( 

84 "Url", 

85 [ 

86 ("scheme", typing.Optional[str]), 

87 ("auth", typing.Optional[str]), 

88 ("host", typing.Optional[str]), 

89 ("port", typing.Optional[int]), 

90 ("path", typing.Optional[str]), 

91 ("query", typing.Optional[str]), 

92 ("fragment", typing.Optional[str]), 

93 ], 

94 ) 

95): 

96 """ 

97 Data structure for representing an HTTP URL. Used as a return value for 

98 :func:`parse_url`. Both the scheme and host are normalized as they are 

99 both case-insensitive according to RFC 3986. 

100 """ 

101 

102 def __new__( # type: ignore[no-untyped-def] 

103 cls, 

104 scheme: str | None = None, 

105 auth: str | None = None, 

106 host: str | None = None, 

107 port: int | None = None, 

108 path: str | None = None, 

109 query: str | None = None, 

110 fragment: str | None = None, 

111 ): 

112 if path and not path.startswith("/"): 

113 path = "/" + path 

114 if scheme is not None: 

115 scheme = scheme.lower() 

116 return super().__new__(cls, scheme, auth, host, port, path, query, fragment) 

117 

118 @property 

119 def hostname(self) -> str | None: 

120 """For backwards-compatibility with urlparse. We're nice like that.""" 

121 return self.host 

122 

123 @property 

124 def request_uri(self) -> str: 

125 """Absolute path including the query string.""" 

126 uri = self.path or "/" 

127 

128 if self.query is not None: 

129 uri += "?" + self.query 

130 

131 return uri 

132 

133 @property 

134 def authority(self) -> str | None: 

135 """ 

136 Authority component as defined in RFC 3986 3.2. 

137 This includes userinfo (auth), host and port. 

138 

139 i.e. 

140 userinfo@host:port 

141 """ 

142 userinfo = self.auth 

143 netloc = self.netloc 

144 if netloc is None or userinfo is None: 

145 return netloc 

146 else: 

147 return f"{userinfo}@{netloc}" 

148 

149 @property 

150 def netloc(self) -> str | None: 

151 """ 

152 Network location including host and port. 

153 

154 If you need the equivalent of urllib.parse's ``netloc``, 

155 use the ``authority`` property instead. 

156 """ 

157 if self.host is None: 

158 return None 

159 if self.port: 

160 return f"{self.host}:{self.port}" 

161 return self.host 

162 

163 @property 

164 def url(self) -> str: 

165 """ 

166 Convert self into a url 

167 

168 This function should more or less round-trip with :func:`.parse_url`. The 

169 returned url may not be exactly the same as the url inputted to 

170 :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls 

171 with a blank port will have : removed). 

172 

173 Example: 

174 

175 .. code-block:: python 

176 

177 import urllib3 

178 

179 U = urllib3.util.parse_url("https://google.com/mail/") 

180 

181 print(U.url) 

182 # "https://google.com/mail/" 

183 

184 print( urllib3.util.Url("https", "username:password", 

185 "host.com", 80, "/path", "query", "fragment" 

186 ).url 

187 ) 

188 # "https://username:password@host.com:80/path?query#fragment" 

189 """ 

190 scheme, auth, host, port, path, query, fragment = self 

191 url = "" 

192 

193 # We use "is not None" we want things to happen with empty strings (or 0 port) 

194 if scheme is not None: 

195 url += scheme + "://" 

196 if auth is not None: 

197 url += auth + "@" 

198 if host is not None: 

199 url += host 

200 if port is not None: 

201 url += ":" + str(port) 

202 if path is not None: 

203 url += path 

204 if query is not None: 

205 url += "?" + query 

206 if fragment is not None: 

207 url += "#" + fragment 

208 

209 return url 

210 

211 def __str__(self) -> str: 

212 return self.url 

213 

214 

@typing.overload
def _encode_invalid_chars(
    component: str, allowed_chars: typing.Container[str]
) -> str:  # Abstract
    ...


@typing.overload
def _encode_invalid_chars(
    component: None, allowed_chars: typing.Container[str]
) -> None:  # Abstract
    ...


def _encode_invalid_chars(
    component: str | None, allowed_chars: typing.Container[str]
) -> str | None:
    """Percent-encode every byte of ``component`` that is not allowed.

    ``None`` is passed through untouched. Existing ``%XX`` escapes have their
    hex digits upper-cased. If every ``%`` in the component belongs to such an
    escape, the component counts as already percent-encoded and its ``%``
    bytes are kept; otherwise ``%`` is treated like any other byte.

    :param component: URI component to encode, or ``None``.
    :param allowed_chars: characters that may stay un-encoded.
    :return: the encoded component, or ``None`` if ``component`` was ``None``.
    """
    if component is None:
        return component

    component = to_str(component)

    # Upper-case the existing escapes and count how many there are.
    component, escape_count = _PERCENT_RE.subn(
        lambda found: found.group(0).upper(), component
    )

    raw = component.encode("utf-8", "surrogatepass")
    # Only when each '%' is part of a valid escape may '%' be left alone.
    keep_percent = escape_count == raw.count(b"%")
    result = bytearray()

    for code in raw:
        is_kept_percent = keep_percent and code == 0x25  # 0x25 is "%"
        if is_kept_percent or (code < 128 and chr(code) in allowed_chars):
            result.append(code)
        else:
            result.extend(b"%%%02X" % code)

    return result.decode()

263 

264 

265def _remove_path_dot_segments(path: str) -> str: 

266 # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code 

267 segments = path.split("/") # Turn the path into a list of segments 

268 output = [] # Initialize the variable to use to store output 

269 

270 for segment in segments: 

271 # '.' is the current directory, so ignore it, it is superfluous 

272 if segment == ".": 

273 continue 

274 # Anything other than '..', should be appended to the output 

275 if segment != "..": 

276 output.append(segment) 

277 # In this case segment == '..', if we can, we should pop the last 

278 # element 

279 elif output: 

280 output.pop() 

281 

282 # If the path starts with '/' and the output is empty or the first string 

283 # is non-empty 

284 if path.startswith("/") and (not output or output[0]): 

285 output.insert(0, "") 

286 

287 # If the path starts with '/.' or '/..' ensure we add one more empty 

288 # string to add a trailing '/' 

289 if path.endswith(("/.", "/..")): 

290 output.append("") 

291 

292 return "/".join(output) 

293 

294 

@typing.overload
def _normalize_host(host: None, scheme: str | None) -> None: ...


@typing.overload
def _normalize_host(host: str, scheme: str | None) -> str: ...


def _normalize_host(host: str | None, scheme: str | None) -> str | None:
    """Normalize ``host`` when ``scheme`` is one of ``_NORMALIZABLE_SCHEMES``.

    * An empty/``None`` host, or a host under any other scheme, is returned
      unchanged.
    * A bracketed IPv6 literal is lower-cased; a zone identifier, if present,
      is re-emitted behind a single ``%`` with its invalid characters
      percent-encoded.
    * A host matching ``_IPV4_RE`` is returned unchanged.
    * Anything else is split on ``.`` and each label goes through
      :func:`_idna_encode`.
    """
    if not host or scheme not in _NORMALIZABLE_SCHEMES:
        return host

    if _IPV6_ADDRZ_RE.match(host):
        # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as
        # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID
        # separator as necessary to return a valid RFC 4007 scoped IP.
        zone = _ZONE_ID_RE.search(host)
        if zone is None:
            return host.lower()

        start, end = zone.span(1)
        zone_id = host[start:end]
        # Drop the separator: "%25" (three characters) when something
        # follows it, otherwise just the leading "%".
        if zone_id.startswith("%25") and zone_id != "%25":
            separator_len = 3
        else:
            separator_len = 1
        zone_id = _encode_invalid_chars(zone_id[separator_len:], _UNRESERVED_CHARS)
        return f"{host[:start].lower()}%{zone_id}{host[end:]}"

    if _IPV4_RE.match(host):
        return host

    encoded_labels = [_idna_encode(label) for label in host.split(".")]
    return to_str(b".".join(encoded_labels), "ascii")

330 

331 

332def _idna_encode(name: str) -> bytes: 

333 if not name.isascii(): 

334 try: 

335 import idna 

336 except ImportError: 

337 raise LocationParseError( 

338 "Unable to parse URL without the 'idna' module" 

339 ) from None 

340 

341 try: 

342 return idna.encode(name.lower(), strict=True, std3_rules=True) 

343 except idna.IDNAError: 

344 raise LocationParseError( 

345 f"Name '{name}' is not a valid IDNA label" 

346 ) from None 

347 

348 return name.lower().encode("ascii") 

349 

350 

def _encode_target(target: str) -> str:
    """Percent-encodes a request target so that there are no invalid characters

    Pre-condition for this function is that 'target' must start with '/'.
    If that is the case then _TARGET_RE will always produce a match.

    Only the path and the query are returned; a fragment in ``target`` is
    matched by ``_TARGET_RE`` but not captured, so it is not part of the
    result.

    :raises LocationParseError: if ``target`` does not match ``_TARGET_RE``.
    """
    parsed = _TARGET_RE.match(target)
    if parsed is None:  # Defensive:
        raise LocationParseError(f"{target!r} is not a valid request URI")

    raw_path, raw_query = parsed.groups()
    encoded_path = _encode_invalid_chars(raw_path, _PATH_CHARS)
    if raw_query is None:
        return encoded_path
    return encoded_path + "?" + _encode_invalid_chars(raw_query, _QUERY_CHARS)

367 

368 

def parse_url(url: str) -> Url:
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 and RFC 6874 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.
    :raises LocationParseError: if the url cannot be parsed or its port is
        outside 0-65535.

    Partly backwards-compatible with :mod:`urllib.parse`.

    Example:

    .. code-block:: python

        import urllib3

        print( urllib3.util.parse_url('http://google.com/mail/'))
        # Url(scheme='http', host='google.com', port=None, path='/mail/', ...)

        print( urllib3.util.parse_url('google.com:80'))
        # Url(scheme=None, host='google.com', port=80, path=None, ...)

        print( urllib3.util.parse_url('/foo?bar'))
        # Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Nothing to parse: every field stays None.
        return Url()

    original_url = url
    # Without a leading scheme or "/", treat the input as starting with the
    # authority so that _URI_RE picks up the host.
    if _SCHEME_RE.search(url) is None:
        url = "//" + url

    scheme: str | None
    authority: str | None
    auth: str | None
    host: str | None
    port: str | None
    port_number: int | None
    path: str | None
    query: str | None
    fragment: str | None

    try:
        # A failed match yields None, whose missing .groups() raises the
        # AttributeError handled below.
        scheme, authority, path, query, fragment = _URI_RE.match(url).groups()  # type: ignore[union-attr]
        should_normalize = scheme is None or scheme.lower() in _NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        if authority:
            # Everything before the last "@" is the userinfo.
            auth, _, host_port = authority.rpartition("@")
            auth = auth or None
            host, port = _HOST_PORT_RE.match(host_port).groups()  # type: ignore[union-attr]
            if auth and should_normalize:
                auth = _encode_invalid_chars(auth, _USERINFO_CHARS)
            # "host:" carries an empty port, which means "no port".
            if port == "":
                port = None
        else:
            auth = host = port = None

        if port is None:
            port_number = None
        else:
            port_number = int(port)
            if port_number < 0 or port_number > 65535:
                raise LocationParseError(url)

        host = _normalize_host(host, scheme)

        if should_normalize:
            if path:
                path = _remove_path_dot_segments(path)
                path = _encode_invalid_chars(path, _PATH_CHARS)
            if query:
                query = _encode_invalid_chars(query, _QUERY_CHARS)
            if fragment:
                fragment = _encode_invalid_chars(fragment, _FRAGMENT_CHARS)

    except (ValueError, AttributeError) as e:
        # Always report the url exactly as the caller passed it in.
        raise LocationParseError(original_url) from e

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        has_trailing_part = query is not None or fragment is not None
        path = "" if has_trailing_part else None

    return Url(
        scheme=scheme,
        auth=auth,
        host=host,
        port=port_number,
        path=path,
        query=query,
        fragment=fragment,
    )