Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/urllib/parse.py: 26%

589 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-20 07:00 +0000

1"""Parse (absolute and relative) URLs. 

2 

3urlparse module is based upon the following RFC specifications. 

4 

5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 

6and L. Masinter, January 2005. 

7 

8RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter

9and L.Masinter, December 1999. 

10 

11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 

12Berners-Lee, R. Fielding, and L. Masinter, August 1998. 

13 

14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. 

15 

16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 

171995. 

18 

19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 

20McCahill, December 1994 

21 

22RFC 3986 is considered the current standard and any future changes to 

23urlparse module should conform with it. The urlparse module is 

24currently not entirely compliant with this RFC due to de facto

25scenarios for parsing, and for backward compatibility purposes, some 

26parsing quirks from older RFCs are retained. The testcases in 

27test_urlparse.py provides a good indicator of parsing behavior. 

28""" 

29 

30import re 

31import sys 

32import types 

33import collections 

34import warnings 

35 

36__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 

37 "urlsplit", "urlunsplit", "urlencode", "parse_qs", 

38 "parse_qsl", "quote", "quote_plus", "quote_from_bytes", 

39 "unquote", "unquote_plus", "unquote_to_bytes", 

40 "DefragResult", "ParseResult", "SplitResult", 

41 "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"] 

42 

43# A classification of schemes. 

44# The empty string classifies URLs with no scheme specified, 

45# being the default value returned by “urlsplit” and “urlparse”. 

46 

47uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap', 

48 'wais', 'file', 'https', 'shttp', 'mms', 

49 'prospero', 'rtsp', 'rtspu', 'sftp', 

50 'svn', 'svn+ssh', 'ws', 'wss'] 

51 

52uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet', 

53 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 

54 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', 

55 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh', 

56 'ws', 'wss'] 

57 

58uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap', 

59 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', 

60 'mms', 'sftp', 'tel'] 

61 

62# These are not actually used anymore, but should stay for backwards 

63# compatibility. (They are undocumented, but have a public-looking name.) 

64 

65non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 

66 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] 

67 

68uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms', 

69 'gopher', 'rtsp', 'rtspu', 'sip', 'sips'] 

70 

71uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news', 

72 'nntp', 'wais', 'https', 'shttp', 'snews', 

73 'file', 'prospero'] 

74 

75# Characters valid in scheme names 

76scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 

77 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 

78 '0123456789' 

79 '+-.') 

80 

81# Unsafe bytes to be removed per WHATWG spec 

82_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] 

83 

84# XXX: Consider replacing with functools.lru_cache 

85MAX_CACHE_SIZE = 20 

86_parse_cache = {} 

87 

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    # Both caches are plain module-level dicts; empty them in place so that
    # existing references to the dict objects stay valid.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()

92 

93 

94# Helpers for bytes handling 

95# For 3.2, we deliberately require applications that 

96# handle improperly quoted URLs to do their own 

97# decoding and encoding. If valid use cases are 

98# presented, we may relax this by using latin-1 

99# decoding internally for 3.3 

100_implicit_encoding = 'ascii' 

101_implicit_errors = 'strict' 

102 

def _noop(obj):
    """Return *obj* unchanged (result coercion used for str inputs)."""
    return obj

105 

def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode a str-based result back to bytes (coercion for bytes inputs)."""
    encoded = obj.encode(encoding, errors)
    return encoded

109 

def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args* to str; falsy items become ''."""
    decoded = []
    for item in args:
        # Falsy values (b'', '', None) are normalised to the empty str.
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)

113 

def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    ``_noop`` when the first argument is a str, ``_encode_result`` otherwise.
    Raises TypeError if str and non-str arguments are mixed.
    """
    wants_str = isinstance(args[0], str)
    # Falsy arguments are exempt from the check so that the "scheme=''"
    # default of some functions works together with bytes input.
    if any(arg and isinstance(arg, str) != wants_str for arg in args[1:]):
        raise TypeError("Cannot mix str and non-str arguments")
    if not wants_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)

129 

130# Result objects are more helpful than simple tuples 

class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        # Encode each field, then build the class registered as
        # ``_encoded_counterpart`` from the encoded fields.
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)

137 

138 

class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        # Decode each field, then build the class registered as
        # ``_decoded_counterpart`` from the decoded fields.
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)

145 

146 

class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses must provide ``_userinfo`` -> (username, password) and
    ``_hostinfo`` -> (hostname, port), using str or bytes consistently.
    """
    __slots__ = ()

    @property
    def username(self):
        """User name part of the netloc, or None."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password part of the netloc, or None."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port as an int, or None when no port is present.

        Raises ValueError if the port is not a run of ASCII digits or is
        outside the range 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone is too permissive: it accepts signs, surrounding
        # whitespace, underscores ('8_0') and non-ASCII digits, none of which
        # are valid in a URL port.  Require plain ASCII digits first.
        if not (port.isascii() and port.isdigit()):
            raise ValueError(
                f'Port could not be cast to integer value as {port!r}')
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)

184 

185 

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    # str flavour of the netloc helpers; reads ``self.netloc``.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'."""
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        # No ':' means no password at all (None), not an empty password.
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'."""
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            # Bracketed (IPv6-style) host: the port follows the closing ']'.
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_port.partition(':')
        # An empty port string is reported as None.
        return host, (port or None)

214 

215 

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    # bytes flavour of the netloc helpers; reads ``self.netloc``.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'."""
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        # No b':' means no password at all (None), not an empty password.
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'."""
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            # Bracketed (IPv6-style) host: the port follows the closing b']'.
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_port.partition(b':')
        # An empty port is reported as None.
        return host, (port or None)

244 

245 

246from collections import namedtuple 

247 

248_DefragResultBase = namedtuple('DefragResult', 'url fragment') 

249_SplitResultBase = namedtuple( 

250 'SplitResult', 'scheme netloc path query fragment') 

251_ParseResultBase = namedtuple( 

252 'ParseResult', 'scheme netloc path params query fragment') 

253 

254_DefragResultBase.__doc__ = """ 

255DefragResult(url, fragment) 

256 

257A 2-tuple that contains the url without fragment identifier and the fragment 

258identifier as a separate argument. 

259""" 

260 

261_DefragResultBase.url.__doc__ = """The URL with no fragment identifier.""" 

262 

263_DefragResultBase.fragment.__doc__ = """ 

264Fragment identifier separated from URL, that allows indirect identification of a 

265secondary resource by reference to a primary resource and additional identifying 

266information. 

267""" 

268 

269_SplitResultBase.__doc__ = """ 

270SplitResult(scheme, netloc, path, query, fragment) 

271 

272A 5-tuple that contains the different components of a URL. Similar to 

273ParseResult, but does not split params. 

274""" 

275 

276_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request.""" 

277 

278_SplitResultBase.netloc.__doc__ = """ 

279Network location where the request is made to. 

280""" 

281 

282_SplitResultBase.path.__doc__ = """ 

283The hierarchical path, such as the path to a file to download. 

284""" 

285 

286_SplitResultBase.query.__doc__ = """ 

287The query component, that contains non-hierarchical data, that along with data 

288in path component, identifies a resource in the scope of URI's scheme and 

289network location. 

290""" 

291 

292_SplitResultBase.fragment.__doc__ = """ 

293Fragment identifier, that allows indirect identification of a secondary resource 

294by reference to a primary resource and additional identifying information. 

295""" 

296 

297_ParseResultBase.__doc__ = """ 

298ParseResult(scheme, netloc, path, params, query, fragment) 

299 

300A 6-tuple that contains components of a parsed URL. 

301""" 

302 

303_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__ 

304_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__ 

305_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__ 

306_ParseResultBase.params.__doc__ = """ 

307Parameters for last path element used to dereference the URI in order to provide 

308access to perform some operation on the resource. 

309""" 

310 

311_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__ 

312_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__ 

313 

314 

315# For backwards compatibility, alias _NetlocResultMixinStr 

316# ResultBase is no longer part of the documented API, but it is 

317# retained since deprecating it isn't worth the hassle 

318ResultBase = _NetlocResultMixinStr 

319 

320# Structured result objects for string data 

class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Rebuild the URL, re-attaching '#fragment' when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment

328 

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Recombine the five components into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt

333 

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Recombine the six components into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt

338 

339# Structured result objects for bytes data 

class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Rebuild the URL, re-attaching b'#fragment' when it is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment

347 

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Recombine the five components into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt

352 

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Recombine the six components into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt

357 

358# Set up the encode/decode result pairs 

def _fix_result_transcoding():
    """Link every str result class with its bytes twin, in both directions."""
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding

371 

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a named 6-tuple: ParseResult for str input, ParseResultBytes
    for bytes input.  The username, password, hostname and port parts of
    netloc are also available as attributes of the result.

    *scheme* is the default used when the URL itself carries no scheme.
    With *allow_fragments* false the fragment is left attached to the
    preceding component (path or query).

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ';params' split off the path.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))

401 

def _splitparams(url):
    """Split *url* at a ';' into (path, params).

    When the path contains '/', only a ';' at or after the last '/' counts;
    if there is none the result is (url, '').
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        cut = url.find(';', last_slash)
        if cut < 0:
            return url, ''
    else:
        # No '/' present: split at the first ';' (callers are expected to
        # have checked that one exists).
        cut = url.find(';')
    return url[:cut], url[cut+1:]

410 

def _splitnetloc(url, start=0):
    """Return (netloc, rest): the netloc runs from *start* up to the first
    '/', '?' or '#' (or the end of *url* when none of them occurs)."""
    found = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    # Earliest delimiter wins; with no delimiter the netloc extends to the end.
    end = min(found, default=len(url))
    return url[start:end], url[end:]

418 

def _checknetloc(netloc):
    """Raise ValueError if NFKC normalization of *netloc* would introduce
    one of the delimiter characters '/', '?', '#', '@' or ':'."""
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # Delimiters already present are removed first so that only characters
    # produced by the normalization itself are detected.
    stripped = netloc
    for ch in '@:#?':
        stripped = stripped.replace(ch, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(ch in normalized for ch in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")

436 

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    ASCII tab, carriage return and newline characters are removed from
    both url and scheme before parsing.

    Raises ValueError for an unbalanced '[' / ']' in the netloc, or when
    the netloc fails the NFKC check in _checknetloc().

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)

    # Strip the unsafe characters *before* looking for the scheme (and before
    # building the cache key).  Removing them afterwards let an input such as
    # "java\nscript:..." parse with an empty scheme while still re-serialising
    # as "javascript:...".
    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
        scheme = scheme.replace(b, "")

    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    # Text before the first ':' is a scheme only if it is non-empty and made
    # up entirely of valid scheme characters.
    if i > 0 and all(c in scheme_chars for c in url[:i]):
        scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one kind of bracket present means a malformed IPv6 literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

491 

def urlunparse(components):
    """Reassemble a URL from the six components returned by urlparse().

    The result may differ slightly from the original URL while remaining
    equivalent, e.g. when the original had redundant delimiters such as a
    '?' with an empty query.
    """
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    # Fold non-empty params back into the path, then defer to urlunsplit().
    path = "%s;%s" % (url, params) if params else url
    return _coerce_result(urlunsplit((scheme, netloc, path, query, fragment)))

502 

def urlunsplit(components):
    """Combine the five components returned by urlsplit() into a URL.

    *components* can be any five-item iterable.  The result may differ
    slightly from the original URL while remaining equivalent, e.g. when
    the original had unnecessary delimiters such as a '?' with an empty
    query.
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    # An authority part ('//netloc') is written when a netloc is present, or
    # when the scheme is one that uses a netloc and the path does not already
    # begin with '//'.
    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)

521 

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    def _assemble(final_path):
        # Rebuild the result from the current component values.
        return _coerce_result(urlunparse((scheme, netloc, final_path,
                                          params, query, fragment)))

    # A different scheme, or one without relative resolution, means *url*
    # is taken as-is.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _assemble(path)
        netloc = bnetloc

    if not path and not params:
        # Nothing path-like in *url*: inherit path/params (and the query,
        # if *url* has none) from the base.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _assemble(path)

    base_dirs = bpath.split('/')
    if base_dirs[-1]:
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_dirs.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_dirs + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = [part for part in segments[1:-1] if part]

    resolved = []
    for part in segments:
        if part == '.':
            continue
        if part == '..':
            # ignore any .. segments that would climb above the root
            if resolved:
                resolved.pop()
        else:
            resolved.append(part)

    if segments[-1] in ('.', '..'):
        # if the last segment was a relative dir, the result needs a
        # trailing '/'
        resolved.append('')

    return _assemble('/'.join(resolved) or '/')

589 

590 

def urldefrag(url):
    """Remove any fragment from *url*.

    Returns a DefragResult (or its bytes counterpart for bytes input)
    holding the URL without its fragment and the fragment itself; the
    fragment is the empty string when the URL has none.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    s, n, p, a, q, frag = urlparse(url)
    # Re-serialise with an empty fragment to obtain the defragmented URL.
    defrag = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(defrag, frag))

606 

607_hexdig = '0123456789ABCDEFabcdef' 

608_hextobyte = None 

609 

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Attribute probe: raises AttributeError for falsy non-string input.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    out = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid two-digit hex escape: keep the '%' literally.
            out.append(b'%')
            out.append(chunk)
        else:
            out.append(byte)
            out.append(chunk[2:])
    return b''.join(out)

639 

640_asciire = re.compile('([\x00-\x7f]+)') 

641 

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes with the characters they encode.

    *encoding* and *errors* control how the percent-encoded byte sequences
    are decoded, as for bytes.decode().  By default they are decoded as
    UTF-8 and invalid sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute probe: raises AttributeError for non-string containers.
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # The capturing split alternates: non-ASCII text, ASCII run, non-ASCII
    # text, ...  Only the ASCII runs can contain escapes.
    bits = _asciire.split(string)
    res = [bits[0]]
    for ascii_run, other in zip(bits[1::2], bits[2::2]):
        res.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        res.append(other)
    return ''.join(res)

668 

669 

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: if true, blank values are kept as empty
            strings; if false (the default) fields with blank values are
            dropped.

        strict_parsing: if true, parsing errors raise ValueError; if
            false (the default) they are silently ignored.

        encoding and errors: how percent-encoded sequences are decoded,
            as accepted by bytes.decode().

        max_num_fields: int. If set, ValueError is raised when more than
            this many fields are read by parse_qsl().

        separator: str. The symbol separating the query arguments.
            Defaults to &.

        Returns a dictionary mapping each name to the list of its values.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        # Group repeated names, keeping values in order of appearance.
        parsed_result.setdefault(name, []).append(value)
    return parsed_result

710 

711 

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError
            if there are more than n fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a list of (name, value) pairs.

        Raises ValueError if separator is empty or not str/bytes, if
        max_num_fields is exceeded, or (with strict_parsing) on a field
        without '='.
    """
    # Validate the separator before coercing it: coercion calls .decode() on
    # truthy non-str values, so an invalid type (e.g. an int) would otherwise
    # surface as an AttributeError instead of the intended ValueError.
    if not separator or not isinstance(separator, (str, bytes)):
        raise ValueError("Separator must be of type string or bytes.")

    qs, _coerce_result = _coerce_args(qs)
    separator, _ = _coerce_args(separator)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    r = []
    for name_value in qs.split(separator):
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' means space in form encoding; translate before unquoting so
            # that an escaped '%2B' still decodes to a literal plus.
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

778 

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first turned into spaces, as
    needed for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # Replace '+' before unquoting so an escaped '%2B' stays a literal plus.
    return unquote(string.replace('+', ' '), encoding, errors)

787 

788_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 

789 b'abcdefghijklmnopqrstuvwxyz' 

790 b'0123456789' 

791 b'_.-~') 

792_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) 

793_safe_quoters = {} 

794 

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Results are memoised in the mapping itself, so a repeated lookup is a
    # plain dict hit and only a miss reaches __missing__.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, and return it.
        if b in self.safe:
            res = chr(b)
        else:
            res = '%{:02X}'.format(b)
        self[b] = res
        return res

816 

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-escape every character of *string* that is neither unreserved
    ("always safe": letters, digits and "-._~" per RFC 3986) nor listed in
    *safe*.

    RFC 3986 character classes, for reference:

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each reserved character is reserved in some URL component but not
    necessarily in all of them, so this function quotes cautiously rather
    than minimally.  *safe* defaults to '/' because the typical use is
    quoting a path whose existing slashes should be kept.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object (TypeError).

    For str input, *encoding* and *errors* are passed to str.encode();
    they default to 'utf-8' and 'strict'.
    """
    if not isinstance(string, str):
        # Bytes-like input is quoted as-is; text-encoding options make no
        # sense for it.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    string = string.encode('utf-8' if encoding is None else encoding,
                           'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)

870 

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but spaces become '+', as required for quoting HTML
    form values.  Plus signs in the original string are escaped unless
    they are included in safe.  Unlike quote(), safe does not default
    to '/'.
    """
    # string may be str or bytes.  Without any space in it, plain quote()
    # already gives the right answer.
    str_without_space = isinstance(string, str) and ' ' not in string
    bytes_without_space = isinstance(string, bytes) and b' ' not in string
    if str_without_space or bytes_without_space:
        return quote(string, safe, encoding, errors)
    # Mark the space as safe so it survives quoting, then swap it for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')

887 

def quote_from_bytes(bs, safe='/'):
    """Percent-quote a bytes object and return the result as an ASCII str.

    Works like quote(), except that the input is already bytes, so no
    string-to-bytes encoding step takes place.

    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object containing ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: if stripping every safe byte leaves nothing, there is
    # nothing to escape.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One quoter is cached per distinct 'safe' value.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))

909 

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a non-string
    sequence of tuples.
    """
    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so reject
        # anything whose first element is not a tuple.
        try:
            # Non-sequence items fail on len(); non-empty strings fail the
            # tuple check.  Zero-length sequences of any type pass, which
            # keeps empty inputs working the way empty dicts do.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def quote_component(item):
        # bytes are quoted as-is; anything else is converted with str()
        # first and quoted using the requested encoding/errors.
        if isinstance(item, bytes):
            return quote_via(item, safe)
        return quote_via(str(item), safe, encoding, errors)

    params = []
    for key, value in query:
        key = quote_component(key)
        if doseq and not isinstance(value, (bytes, str)):
            try:
                # Having a length is the test for "is a sequence".
                len(value)
            except TypeError:
                # Not a sequence: fall through and encode as a single value.
                pass
            else:
                # One key=element parameter per sequence element.
                params.extend(key + '=' + quote_component(elt)
                              for elt in value)
                continue
        params.append(key + '=' + quote_component(value))
    return '&'.join(params)

989 

990 

def to_bytes(url):
    """Deprecated public alias of _to_bytes().

    Issues a DeprecationWarning, then delegates to _to_bytes().
    """
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _to_bytes(url)

995 

996 

997def _to_bytes(url): 

998 """to_bytes(u"URL") --> 'URL'.""" 

999 # Most URL schemes require ASCII. If that changes, the conversion 

1000 # can be relaxed. 

1001 # XXX get rid of to_bytes() 

1002 if isinstance(url, str): 

1003 try: 

1004 url = url.encode("ASCII").decode() 

1005 except UnicodeError: 

1006 raise UnicodeError("URL " + repr(url) + 

1007 " contains non-ASCII characters") 

1008 return url 

1009 

1010 

def unwrap(url):
    """Strip a '<URL:...>' wrapper: '<URL:scheme://host/path>' -> 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  Enclosing angle brackets and a leading 'URL:' prefix are
    each removed when present; otherwise the text is returned as is.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

1022 

1023 

def splittype(url):
    """Deprecated public alias of _splittype().

    Issues a DeprecationWarning, then delegates to _splittype().
    """
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittype(url)

1029 

1030 

1031_typeprog = None 

1032def _splittype(url): 

1033 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 

1034 global _typeprog 

1035 if _typeprog is None: 

1036 _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) 

1037 

1038 match = _typeprog.match(url) 

1039 if match: 

1040 scheme, data = match.groups() 

1041 return scheme.lower(), data 

1042 return None, url 

1043 

1044 

def splithost(url):
    """Deprecated public alias of _splithost().

    Issues a DeprecationWarning, then delegates to _splithost().
    """
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splithost(url)

1050 

1051 

1052_hostprog = None 

1053def _splithost(url): 

1054 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 

1055 global _hostprog 

1056 if _hostprog is None: 

1057 _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) 

1058 

1059 match = _hostprog.match(url) 

1060 if match: 

1061 host_port, path = match.groups() 

1062 if path and path[0] != '/': 

1063 path = '/' + path 

1064 return host_port, path 

1065 return None, url 

1066 

1067 

def splituser(host):
    """Deprecated public alias of _splituser().

    Issues a DeprecationWarning, then delegates to _splituser().
    """
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splituser(host)

1073 

1074 

1075def _splituser(host): 

1076 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 

1077 user, delim, host = host.rpartition('@') 

1078 return (user if delim else None), host 

1079 

1080 

def splitpasswd(user):
    """Deprecated public alias of _splitpasswd().

    Issues a DeprecationWarning, then delegates to _splitpasswd().
    """
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)

1086 

1087 

1088def _splitpasswd(user): 

1089 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 

1090 user, delim, passwd = user.partition(':') 

1091 return user, (passwd if delim else None) 

1092 

1093 

def splitport(host):
    """Deprecated public alias of _splitport().

    Issues a DeprecationWarning, then delegates to _splitport().
    """
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitport(host)

1099 

1100 

1101# splittag('/path#tag') --> '/path', 'tag' 

1102_portprog = None 

1103def _splitport(host): 

1104 """splitport('host:port') --> 'host', 'port'.""" 

1105 global _portprog 

1106 if _portprog is None: 

1107 _portprog = re.compile('(.*):([0-9]*)', re.DOTALL) 

1108 

1109 match = _portprog.fullmatch(host) 

1110 if match: 

1111 host, port = match.groups() 

1112 if port: 

1113 return host, port 

1114 return host, None 

1115 

1116 

def splitnport(host, defport=-1):
    """Deprecated public alias of _splitnport().

    Issues a DeprecationWarning, then delegates to _splitnport().
    """
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)

1122 

1123 

1124def _splitnport(host, defport=-1): 

1125 """Split host and port, returning numeric port. 

1126 Return given default port if no ':' found; defaults to -1. 

1127 Return numerical port if a valid number are found after ':'. 

1128 Return None if ':' but not a valid number.""" 

1129 host, delim, port = host.rpartition(':') 

1130 if not delim: 

1131 host = port 

1132 elif port: 

1133 try: 

1134 nport = int(port) 

1135 except ValueError: 

1136 nport = None 

1137 return host, nport 

1138 return host, defport 

1139 

1140 

def splitquery(url):
    """Deprecated public alias of _splitquery().

    Issues a DeprecationWarning, then delegates to _splitquery().
    """
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitquery(url)

1146 

1147 

1148def _splitquery(url): 

1149 """splitquery('/path?query') --> '/path', 'query'.""" 

1150 path, delim, query = url.rpartition('?') 

1151 if delim: 

1152 return path, query 

1153 return url, None 

1154 

1155 

def splittag(url):
    """Deprecated public alias of _splittag().

    Issues a DeprecationWarning, then delegates to _splittag().
    """
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittag(url)

1161 

1162 

1163def _splittag(url): 

1164 """splittag('/path#tag') --> '/path', 'tag'.""" 

1165 path, delim, tag = url.rpartition('#') 

1166 if delim: 

1167 return path, tag 

1168 return url, None 

1169 

1170 

def splitattr(url):
    """Deprecated public alias of _splitattr().

    Issues a DeprecationWarning, then delegates to _splitattr().
    """
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitattr(url)

1176 

1177 

1178def _splitattr(url): 

1179 """splitattr('/path;attr1=value1;attr2=value2;...') -> 

1180 '/path', ['attr1=value1', 'attr2=value2', ...].""" 

1181 words = url.split(';') 

1182 return words[0], words[1:] 

1183 

1184 

def splitvalue(attr):
    """Deprecated public alias of _splitvalue().

    Issues a DeprecationWarning, then delegates to _splitvalue().
    """
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)

1190 

1191 

1192def _splitvalue(attr): 

1193 """splitvalue('attr=value') --> 'attr', 'value'.""" 

1194 attr, delim, value = attr.partition('=') 

1195 return attr, (value if delim else None)