Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/parse.py: 25%

566 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1"""Parse (absolute and relative) URLs. 

2 

3urlparse module is based upon the following RFC specifications. 

4 

5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 

6and L. Masinter, January 2005. 

7 

8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter 

9and L.Masinter, December 1999. 

10 

11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 

12Berners-Lee, R. Fielding, and L. Masinter, August 1998. 

13 

14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. 

15 

16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 

171995. 

18 

19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 

20McCahill, December 1994 

21 

22RFC 3986 is considered the current standard and any future changes to 

23urlparse module should conform with it. The urlparse module is 

24currently not entirely compliant with this RFC due to de facto

25scenarios for parsing, and for backward compatibility purposes, some

26parsing quirks from older RFCs are retained. The testcases in

27test_urlparse.py provide a good indicator of parsing behavior.

28""" 

29 

30import re 

31import sys 

32import collections 

33 

34__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 

35 "urlsplit", "urlunsplit", "urlencode", "parse_qs", 

36 "parse_qsl", "quote", "quote_plus", "quote_from_bytes", 

37 "unquote", "unquote_plus", "unquote_to_bytes", 

38 "DefragResult", "ParseResult", "SplitResult", 

39 "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"] 

40 

# Scheme classification tables.
# The empty string stands for "no scheme given", which is the default
# scheme value reported by urlsplit() and urlparse().

# Consulted by urljoin() to decide whether relative resolution applies.
uses_relative = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', 'sftp', 'svn', 'svn+ssh',
    'ws', 'wss',
]

# Consulted by urlunsplit() and urljoin() when handling the '//netloc' part.
uses_netloc = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh', 'ws', 'wss',
]

# Consulted by urlparse() before splitting ';params' off the path.
uses_params = [
    '', 'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', 'sftp', 'tel',
]

# The three tables below are no longer used, but they carry public-looking
# (if undocumented) names, so they stay for backwards compatibility.

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]

uses_query = [
    '', 'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips',
]

uses_fragment = [
    '', 'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero',
]

# Every character that may appear in a scheme name.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)

# Characters stripped from URLs before parsing, per the WHATWG spec.
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
# urlsplit() empties the whole cache once it holds this many entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}

85 

def clear_cache():
    """Empty the urlsplit() result cache and the quoter cache."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()

90 

91 

# Helpers for bytes handling
# For 3.2, applications that handle improperly quoted URLs are deliberately
# required to do their own decoding and encoding.  If valid use cases are
# presented, this may be relaxed by decoding with latin-1 internally.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Return *obj* untouched; the result coercion used for str input."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode *obj*; the result coercion used for non-str input."""
    return obj.encode(encoding, errors)


def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode each item of *args*, mapping every falsy item to ''."""
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Normalise arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    ``_noop`` when the first argument is a str, ``_encode_result``
    otherwise.  Raises TypeError when a non-empty later argument does not
    agree with the first one about being a str.
    """
    first_is_str = isinstance(args[0], str)
    # Falsy values are exempt so that defaults such as scheme='' can be
    # passed alongside non-str input.
    mixed = any(arg and isinstance(arg, str) != first_is_str
                for arg in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)

127 

128# Result objects are more helpful than simple tuples 

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin that converts a str result tuple into its bytes counterpart."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and wrap them in ``_encoded_counterpart``."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)

135 

136 

class _ResultMixinBytes(object):
    """Mixin that converts a bytes result tuple into its str counterpart."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and wrap them in ``_decoded_counterpart``."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)

143 

144 

class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses supply ``_userinfo`` (a ``(username, password)`` pair) and
    ``_hostinfo`` (a ``(hostname, port)`` pair); the properties here only
    post-process those pairs.
    """
    __slots__ = ()

    @property
    def username(self):
        """First item of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port as an int, or None when ``_hostinfo`` carries no port.

        Raises ValueError when the port is not made up solely of ASCII
        digits or lies outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone would also accept signs, surrounding whitespace,
        # underscores and non-ASCII digits, none of which are valid port
        # syntax; require plain ASCII digits before converting.
        digits = '0123456789' if isinstance(port, str) else b'0123456789'
        if not port or port.strip(digits):
            raise ValueError(
                "Port could not be cast to integer value as %r" % (port,))
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port

176 

177 

class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Splits a str ``netloc`` into user info and host info."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return ``(username, password)`` taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` taken from after the last '@'.

        A bracketed host is returned without its brackets; an empty or
        missing port is reported as None.
        """
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_and_port.partition('[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the closing ']'.
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_and_port.partition(':')
        return host, (port or None)

206 

207 

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Splits a bytes ``netloc`` into user info and host info."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return ``(username, password)`` taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` taken from after the last b'@'.

        A bracketed host is returned without its brackets; an empty or
        missing port is reported as None.
        """
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_and_port.partition(b'[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the closing b']'.
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_and_port.partition(b':')
        return host, (port or None)

236 

237 

from collections import namedtuple

# Plain tuple bases; the public result classes add behaviour on top of them.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# --- DefragResult documentation -------------------------------------------
_DefragResultBase.__doc__ = (
    "\n"
    "DefragResult(url, fragment)\n"
    "\n"
    "A 2-tuple that contains the url without fragment identifier and the fragment\n"
    "identifier as a separate argument.\n"
)

_DefragResultBase.url.__doc__ = "The URL with no fragment identifier."

_DefragResultBase.fragment.__doc__ = (
    "\n"
    "Fragment identifier separated from URL, that allows indirect identification of a\n"
    "secondary resource by reference to a primary resource and additional identifying\n"
    "information.\n"
)

# --- SplitResult documentation --------------------------------------------
_SplitResultBase.__doc__ = (
    "\n"
    "SplitResult(scheme, netloc, path, query, fragment)\n"
    "\n"
    "A 5-tuple that contains the different components of a URL. Similar to\n"
    "ParseResult, but does not split params.\n"
)

_SplitResultBase.scheme.__doc__ = "Specifies URL scheme for the request."

_SplitResultBase.netloc.__doc__ = (
    "\n"
    "Network location where the request is made to.\n"
)

_SplitResultBase.path.__doc__ = (
    "\n"
    "The hierarchical path, such as the path to a file to download.\n"
)

_SplitResultBase.query.__doc__ = (
    "\n"
    "The query component, that contains non-hierarchical data, that along with data\n"
    "in path component, identifies a resource in the scope of URI's scheme and\n"
    "network location.\n"
)

_SplitResultBase.fragment.__doc__ = (
    "\n"
    "Fragment identifier, that allows indirect identification of a secondary resource\n"
    "by reference to a primary resource and additional identifying information.\n"
)

# --- ParseResult documentation --------------------------------------------
_ParseResultBase.__doc__ = (
    "\n"
    "ParseResult(scheme, netloc, path, params, query, fragment)\n"
    "\n"
    "A 6-tuple that contains components of a parsed URL.\n"
)

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = (
    "\n"
    "Parameters for last path element used to dereference the URI in order to provide\n"
    "access to perform some operation on the resource.\n"
)

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__

305 

306 

307# For backwards compatibility, alias _NetlocResultMixinStr 

308# ResultBase is no longer part of the documented API, but it is 

309# retained since deprecating it isn't worth the hassle 

310ResultBase = _NetlocResultMixinStr 

311 

312# Structured result objects for string data 

class DefragResult(_DefragResultBase, _ResultMixinStr):
    # (url, fragment) pair holding str data.
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment

320 

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    # Five-field str result; netloc accessors come from the mixin.
    __slots__ = ()

    def geturl(self):
        """Recombine the five fields by handing this tuple to urlunsplit()."""
        return urlunsplit(self)

325 

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    # Six-field str result; netloc accessors come from the mixin.
    __slots__ = ()

    def geturl(self):
        """Recombine the six fields by handing this tuple to urlunparse()."""
        return urlunparse(self)

330 

331# Structured result objects for bytes data 

class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    # (url, fragment) pair holding bytes data.
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment

339 

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    # Five-field bytes result; netloc accessors come from the mixin.
    __slots__ = ()

    def geturl(self):
        """Recombine the five fields by handing this tuple to urlunsplit()."""
        return urlunsplit(self)

344 

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    # Six-field bytes result; netloc accessors come from the mixin.
    __slots__ = ()

    def geturl(self):
        """Recombine the six fields by handing this tuple to urlunparse()."""
        return urlunparse(self)

349 

350# Set up the encode/decode result pairs 

def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    These links are what encode() and decode() on the result mixins use to
    find the class to build.
    """
    for str_cls, bytes_cls in (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    ):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls


# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding

363 

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult 6-tuple
    (scheme, netloc, path, params, query, fragment), encoded again when the
    input was not a str.  The splitting itself is delegated to urlsplit();
    params are separated from the path only for schemes listed in
    uses_params.  Components are not broken into smaller pieces (netloc
    stays one string) and % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))

379 

def _splitparams(url):
    """Split *url* at a ';' into ``(path, params)``.

    When the path contains '/', only a ';' at or after the last '/' counts
    and ``(url, '')`` is returned if there is none.  Without any '/', the
    split happens at the first ';'.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]

388 

def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for *url*, scanning from index *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none is present.
    """
    found = [pos for pos in (url.find(mark, start) for mark in '/?#')
             if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]

396 

def _checknetloc(netloc):
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    Looks for characters such as \\u2100, which normalizes to 'a/c'.  IDNA
    uses NFKC equivalence, so the check normalizes the same way.  Returns
    None when the netloc is acceptable and raises ValueError otherwise.
    """
    if not netloc or all(ord(ch) <= 127 for ch in netloc):
        return
    import unicodedata
    # Drop delimiters that are already present literally, so only those
    # that normalization introduces can trigger the error.
    stripped = netloc
    for literal in '@:#?':
        stripped = stripped.replace(literal, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(delim in normalized for delim in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")

414 

def _remove_unsafe_bytes_from_url(url):
    """Return *url* with every entry of _UNSAFE_URL_BYTES_TO_REMOVE deleted."""
    cleaned = url
    for unsafe in _UNSAFE_URL_BYTES_TO_REMOVE:
        cleaned = cleaned.replace(unsafe, "")
    return cleaned

419 

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult 5-tuple (scheme, netloc, path, query, fragment),
    encoded again when the input was not a str.  Components are not broken
    into smaller pieces (netloc stays one string) and % escapes are not
    expanded.  Results are memoised in _parse_cache.

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that
    _checknetloc() rejects.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    url = _remove_unsafe_bytes_from_url(url)
    scheme = _remove_unsafe_bytes_from_url(scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        candidate = url[:colon]
        if candidate == 'http':
            # Common case: accepted without validating what follows.
            scheme, url = candidate.lower(), url[colon + 1:]
        elif all(ch in scheme_chars for ch in candidate):
            # Make sure the text after ':' is not actually a port number
            # (in which case the "scheme" is really part of the path).
            rest = url[colon + 1:]
            if not rest or any(ch not in '0123456789' for ch in rest):
                scheme, url = candidate.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        has_open = '[' in netloc
        has_close = ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)

479 

def urlunparse(components):
    """Put a parsed URL back together again.

    *components* is the six-item sequence
    (scheme, netloc, path, params, query, fragment).  Non-empty params are
    re-attached to the path with ';' and the rest is delegated to
    urlunsplit().  The result may differ slightly from, but is equivalent
    to, the URL originally parsed if that URL had redundant delimiters,
    e.g. a ? with an empty query.
    """
    *fields, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, params, query, fragment = fields
    if params:
        url = "%s;%s" % (url, params)
    rebuilt = urlunsplit((scheme, netloc, url, query, fragment))
    return _coerce_result(rebuilt)

490 

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL.

    *components* can be any five-item iterable of
    (scheme, netloc, path, query, fragment).  The result may differ
    slightly from, but is equivalent to, the URL originally parsed if that
    URL had unnecessary delimiters (for example, a ? with an empty query).
    """
    *fields, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, query, fragment = fields

    needs_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        # A non-empty path must be rooted before it can follow '//netloc'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

509 

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *url* is also returned as given when its scheme
    differs from the base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing to inherit.
            return _coerce_result(urlunparse(
                (scheme, netloc, path, params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # Only query and/or fragment given: keep the base path and params,
        # and the base query too unless the reference supplies one.
        if not query:
            query = bquery
        return _coerce_result(urlunparse(
            (scheme, netloc, bpath, bparams, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_parts.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = [seg for seg in segments[1:-1] if seg]

    resolved_path = []
    for seg in segments:
        if seg == '..':
            # a '..' with nothing left to remove is ignored, as for rfc3986
            if resolved_path:
                resolved_path.pop()
        elif seg != '.':
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # the last segment was a relative dir, so the result needs a
        # trailing '/'
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse(
        (scheme, netloc, joined_path, params, query, fragment)))

577 

578 

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a DefragResult of the defragmented URL and the fragment,
    encoded again when the input was not a str.  If the URL contained no
    fragment, the second element is the empty string and the URL is
    returned as given.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    parsed = urlparse(url)
    frag = parsed[5]
    # Rebuild from the first five components with an empty fragment.
    defrag = urlunparse(tuple(parsed[:5]) + ('',))
    return _coerce_result(DefragResult(defrag, frag))

594 

_hexdig = '0123456789ABCDEFabcdef'
# Maps two-hex-digit bytes keys to the single byte they denote; built lazily
# by unquote_to_bytes().
_hextobyte = None


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    global _hextobyte
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    if _hextobyte is None:
        # Delay building the table so that no memory is spent on it if the
        # function is never called with an escape.
        table = {}
        for hi in _hexdig:
            for lo in _hexdig:
                table[(hi + lo).encode()] = bytes([int(hi + lo, 16)])
        _hextobyte = table
    pieces = [head]
    for chunk in escaped:
        decoded = _hextobyte.get(chunk[:2])
        if decoded is None:
            # Not a valid escape: keep the '%' and the text literally.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(decoded)
            pieces.append(chunk[2:])
    return b''.join(pieces)

627 

# Captures maximal runs of ASCII characters.
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a placeholder
    character.  Passing None for either parameter selects its default.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # Splitting on the capturing pattern alternates non-ASCII text with
    # ASCII runs; only the ASCII runs can hold escapes.
    first, *rest = _asciire.split(string)
    out = [first]
    for ascii_run, other_run in zip(rest[0::2], rest[1::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other_run)
    return ''.join(out)

654 

655 

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    max_num_fields: int. If set, then throws a ValueError if there
        are more than n fields read by parse_qsl().

    separator: str. The symbol to use for separating the query arguments.
        Defaults to &.

    Returns a dictionary mapping each name to the list of its values, in
    the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors,
                                 max_num_fields=max_num_fields,
                                 separator=separator):
        grouped.setdefault(name, []).append(value)
    return grouped

696 

697 

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    max_num_fields: int. If set, then throws a ValueError
        if there are more than n fields read by parse_qsl().

    separator: str or bytes. The symbol to use for separating the query
        arguments.  Defaults to &.  An empty or non-string separator
        raises ValueError.

    Returns a list of (name, value) pairs; the items are encoded again
    when qs was not a str.
    """
    qs, _coerce_result = _coerce_args(qs)

    if not separator or not isinstance(separator, (str, bytes)):
        raise ValueError("Separator must be of type string or bytes.")
    # qs is a str from here on, so a bytes separator has to become a str
    # too; otherwise qs.count()/qs.split() below raise TypeError.
    separator, _ = _coerce_args(separator)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    r = []
    for name_value in qs.split(separator):
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if not keep_blank_values:
                continue
            nv.append('')
        if nv[1] or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

763 

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required
    for unquoting HTML form values.

    The plus signs are replaced before the %xx escapes are expanded.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)

772 

# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of Quoter lookups, emptied by clear_cache().
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    A key maps to its own character when it is in the safe set (the always
    safe bytes plus those given to the constructor) and to its '%XX'
    escape otherwise.
    """
    # Results are stored in the mapping itself, so a key that was looked up
    # before is answered by the dict without calling Python code at all.

    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (type(self).__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, and return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted

801 

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, quote() is intended for the path section of a URL, so it
    does not encode '/': in typical usage the slashes already present in a
    path are acting as reserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is a bytes object; doing so
    raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)

845 

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.

    Plus signs in the original string are escaped unless they are included
    in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be a str or a bytes object.  When it contains no space,
    # plain quote() already produces the right answer.
    if isinstance(string, str):
        no_space = ' ' not in string
    elif isinstance(string, bytes):
        no_space = b' ' not in string
    else:
        no_space = False
    if no_space:
        return quote(string, safe, encoding, errors)

    # Keep spaces unescaped during quoting, then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')

862 

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.

    Raises TypeError when bs is neither bytes nor bytearray.

    quote_from_bytes(b'abc def\\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to bytes holding only ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Nothing left after stripping safe bytes means nothing needs quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))

884 

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output matches the order of parameters in the input.

    The components of a query arg may each be either a string or a bytes
    type; anything else is converted with str() first.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is not
    bytes).

    Raises TypeError if query is neither a mapping nor a non-string sequence
    whose first element is a tuple.
    """

    def _quote_item(item):
        # bytes are quoted as they are; everything else is stringified and
        # quoted with the caller's encoding/errors settings.
        if isinstance(item, bytes):
            return quote_via(item, safe)
        return quote_via(str(item), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items do not work with len(), and a non-empty
            # string fails the tuple check.  Zero-length sequences of all
            # types get through, which keeps empty inputs working.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    pairs = []
    for k, v in query:
        key = _quote_item(k)
        if not doseq or isinstance(v, bytes):
            encoded = [_quote_item(v)]
        elif isinstance(v, str):
            # A str is a single value here, never a sequence of characters.
            encoded = [quote_via(v, safe, encoding, errors)]
        else:
            try:
                # Anything supporting len() is treated as a sequence.
                len(v)
            except TypeError:
                # Not a sequence: encode it as one value.
                encoded = [_quote_item(v)]
            else:
                # One parameter per element of the sequence.
                encoded = [_quote_item(elt) for elt in v]
        pairs.extend(key + '=' + value for value in encoded)
    return '&'.join(pairs)

964 

def to_bytes(url):
    """Check that a str URL is pure ASCII and return it.

    Despite the name, a str argument comes back as a str (it is encoded
    to ASCII and decoded again).  Arguments that are not str are returned
    untouched.

    Raises UnicodeError if a str URL contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        ascii_url = url.encode("ASCII").decode()
    except UnicodeError:
        message = "URL " + repr(url) + " contains non-ASCII characters"
        raise UnicodeError(message)
    return ascii_url

977 

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  An enclosing '<...>' pair is removed first, then a
    leading 'URL:' prefix; whitespace is stripped again after each step.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

985 

# Compiled pattern for splittype(); built on first use.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When the URL does not start with
    a 'type:' prefix (one or more characters other than '/' and ':'
    followed by a colon), (None, url) is returned.
    """
    global _typeprog
    pattern = _typeprog
    if pattern is None:
        pattern = _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = pattern.match(url)
    if not found:
        return None, url
    return found.group(1).lower(), found.group(2)

998 

# Compiled pattern for splithost(); built on first use.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty
    remainder that does not start with '/' gets one prepended.  URLs that
    do not begin with '//' yield (None, url).
    """
    global _hostprog
    pattern = _hostprog
    if pattern is None:
        pattern = _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = pattern.match(url)
    if not found:
        return None, url
    netloc, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return netloc, rest

1013 

def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without an '@' the user part is
    None and the whole input is returned as the host part.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        return None, hostport
    return userinfo, hostport

1018 

def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the password part
    is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret

1023 

# Compiled pattern for splitport(); built on first use.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  It is taken from after
    the last ':' and only when the whole remainder consists of the
    characters 0-9.  If there is no such port, the port part is None;
    an empty port ('host:') yields ('host', None).
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() rather than match() with '$': '$' also matches just
    # before a trailing newline, which would let 'host:80\n' through and
    # silently drop the newline.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None

1038 

def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return the default port as well if nothing follows the last ':'.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    A valid number consists of ASCII digits only; signs, whitespace,
    underscores and non-ASCII digits are rejected even though int()
    would accept them.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() put the whole input in the last slot.
        host = port
    elif port:
        if port.isdigit() and port.isascii():
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport

1054 

def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the query part is
    None and the URL is returned unchanged.
    """
    head, question_mark, tail = url.rpartition('?')
    return (head, tail) if question_mark else (url, None)

1061 

def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the tag part is
    None and the URL is returned unchanged.
    """
    head, hash_mark, fragment = url.rpartition('#')
    return (head, fragment) if hash_mark else (url, None)

1068 

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when the URL contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs

1074 

def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without an '=' the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value