Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/urllib/request.py: 15%
1646 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:05 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:05 +0000
1"""An extensible library for opening URLs using a variety of protocols
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
27instantiate. If one of the argument is a subclass of the default
28handler, the argument will be installed instead of the default.
30install_opener -- Installs a new opener as the default opener.
32objects of interest:
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
41BaseHandler --
43internals:
44BaseHandler and parent
45_call_chain conventions
47Example usage:
49import urllib.request
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
64# install it
65urllib.request.install_opener(opener)
67f = urllib.request.urlopen('http://www.python.org/')
68"""
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm that was requested in the challenge, it would be
# good to pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105 _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106 _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107 unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
# check for SSL
try:
    import ssl
except ImportError:
    # Remember the outcome so urlopen() can reject cafile/capath/cadefault
    # when HTTPS support is unavailable.
    _have_ssl = False
else:
    _have_ssl = True

__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# Module-level cache of the default opener; created lazily by urlopen()
# and replaceable via install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a
    context manager and has the properties url, headers, and status.
    See urllib.response.addinfourl for more detail on these properties.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    Raises ValueError if *context* is combined with cafile/capath/cadefault,
    or if those legacy parameters are used without SSL support.
    '''
    global _opener
    if cafile or capath or cadefault:
        # ``warnings`` is already imported at module level; the former
        # function-local ``import warnings`` was redundant and is dropped.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a default, certificate-verifying context from the CA material.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Lazily create and cache the default opener on first use.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
def install_opener(opener):
    """Install *opener* as the process-wide default used by urlopen()."""
    global _opener
    _opener = opener
# Paths of NamedTemporaryFiles created below; deleted by urlcleanup().
_url_tempfiles = []

def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.

    Raises ContentTooShortError if fewer bytes arrive than the server's
    Content-Length header announced.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            # No destination given: download into a named temporary file
            # and remember its path so urlcleanup() can delete it later.
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8      # copy in 8 KiB blocks
            size = -1        # -1 means "total size unknown"
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            if reporthook:
                # Initial call so the hook can render 0% progress.
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    # A short read is an error when the server announced a length.
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    global _opener
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except OSError:
            # Best effort: the file may already be gone.
            pass
    _url_tempfiles.clear()
    if _opener:
        _opener = None
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        # URL carried no authority; fall back to the Host header.
        host = request.get_header("Host", "")
    # Strip a trailing :port, if any, before comparison.
    return _cut_port_re.sub("", host, 1).lower()
class Request:
    """Encapsulate the state of a single URL request.

    Holds the target URL, optional request body (``data``), headers,
    plus the bookkeeping used for cookie handling (``origin_req_host``,
    ``unverifiable``) and for proxying (``selector``, ``_tunnel_host``).
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        self.full_url = url          # property setter also parses the URL
        self.headers = {}
        # Headers that must NOT be copied onto a redirected request.
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None     # set by set_proxy() for https-via-proxy
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Re-attach the fragment that the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split full_url into type (scheme), host and selector."""
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        return self.full_url

    def set_proxy(self, host, type):
        """Route this request through the proxy at *host* (scheme *type*)."""
        if self.type == 'https' and not self._tunnel_host:
            # https goes through the proxy by CONNECT tunnelling; keep the
            # original host so the tunnel can be established.
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        # True once set_proxy() made the selector the absolute URL.
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        # Ordinary headers take precedence over unredirected ones.
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())
class OpenerDirector:
    """Manage a chain of BaseHandler objects and use them to open URLs."""

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}       # protocol -> handlers with <proto>_open
        self.handle_error = {}      # protocol -> {code -> error handlers}
        self.process_response = {}  # protocol -> response post-processors
        self.process_request = {}   # protocol -> request pre-processors

    def add_handler(self, handler):
        """Register *handler* under every protocol method it defines.

        Method names of the form <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<code> determine which
        lookup tables the handler joins.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    # Numeric HTTP status codes are stored as ints, e.g. 404.
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each chain sorted by handler_order (BaseHandler.__lt__).
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return a response."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error handler chain."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # Fall back to the catch-all http_error_default chain.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
563# XXX probably also want an abstract factory that knows when it makes
564# sense to skip a superclass in favor of a subclass and when it might
565# make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def _supersedes(check, klass):
        # An argument displaces a default if it is (a subclass of) that
        # default's class, whether given as a class or as an instance.
        if isinstance(check, type):
            return issubclass(check, klass)
        return isinstance(check, klass)

    # Instantiate only those defaults not displaced by an argument,
    # preserving the original registration order.
    for klass in default_classes:
        if not any(_supersedes(check, klass) for check in handlers):
            opener.add_handler(klass())

    # Then add the caller's handlers, instantiating bare classes.
    for handler in handlers:
        if isinstance(handler, type):
            handler = handler()
        opener.add_handler(handler)
    return opener
class BaseHandler:
    """Common base class for handlers managed by an OpenerDirector.

    Handlers are sorted into their chains by ``handler_order``; lower
    values run earlier.
    """

    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Sort by handler_order; order-unaware classes compare greater."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses into the opener's error machinery."""

    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # RFC 2616: only the 2xx range means the request was received,
        # understood and accepted; everything else goes to the error chain.
        is_success = 200 <= code < 300
        if not is_success:
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: surface any unhandled HTTP error status."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No other handler dealt with this status code, so raise it to
        # the caller as an HTTPError exception.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Handle HTTP 301/302/303/307 responses by re-issuing the request."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the redirected request is issued
        # as a bodyless GET, so they would be stale.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path and urlparts.netloc:
            # Authority-only target: normalize to a "/" path.
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
756def _parse_proxy(proxy):
757 """Return (scheme, user, password, host/port) given a URL or an authority.
759 If a URL is supplied, it must have an authority (host:port) component.
760 According to RFC 3986, having an authority component means the URL must
761 have two slashes after the scheme.
762 """
763 scheme, r_scheme = _splittype(proxy)
764 if not r_scheme.startswith("/"):
765 # authority
766 scheme = None
767 authority = proxy
768 else:
769 # URL
770 if not r_scheme.startswith("//"):
771 raise ValueError("proxy URL with no authority: %r" % proxy)
772 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
773 # and 3.3.), path is empty or starts with '/'
774 if '@' in r_scheme:
775 host_separator = r_scheme.find('@')
776 end = r_scheme.find("/", host_separator)
777 else:
778 end = r_scheme.find("/", 2)
779 if end == -1:
780 end = None
781 authority = r_scheme[2:end]
782 userinfo, hostport = _splituser(authority)
783 if userinfo is not None:
784 user, password = _splitpasswd(userinfo)
785 else:
786 user = password = None
787 return scheme, user, password, hostport
class ProxyHandler(BaseHandler):
    """Rewrite requests so they are sent via the configured proxies."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            # Fall back to proxies from the environment/platform settings.
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            type = type.lower()
            # Synthesize a <scheme>_open method for each configured scheme;
            # the keyword defaults bind the current url/type, avoiding the
            # late-binding closure pitfall.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Honor the platform's no-proxy configuration for this host.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            # Credentials embedded in the proxy URL become a
            # Proxy-Authorization: Basic header.
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
class HTTPPasswordMgr:
    """Store and look up (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        # realm -> {tuple of reduced URIs: (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        realm_map = self.passwd.setdefault(realm, {})
        # Store keys both with and without the default port so lookups
        # match either spelling of the authority.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uris)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri* in *realm*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                if any(self.is_suburi(u, reduced) for u in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        parts = urlsplit(uri)
        if parts[1]:
            # Full URI (note HTTP URLs do not have a userinfo component).
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # Bare host or host:port.
            scheme, authority, path = None, uri, '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the scheme's default port explicit in the authority.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to credentials stored under
    the wildcard realm ``None`` when no realm-specific match exists."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first, then retry under the default (None) realm."""
        user, password = super().find_user_password(realm, authuri)
        if user is None:
            return super().find_user_password(None, authuri)
        return user, password
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks which URIs may receive
    preemptive (prior) Basic authentication."""

    def __init__(self, *args, **kwargs):
        # reduced-uri -> bool: may credentials be sent unprompted?
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Register credentials and record the prior-auth flag for *uri*."""
        self.update_authenticated(uri, is_authenticated)
        if realm is not None:
            # Also register under the default realm for prior-auth requests.
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Set the prior-auth flag for one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri

        for default_port in (True, False):
            for u in uris:
                reduced = self.reduce_uri(u, default_port)
                self.authenticated[reduced] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the stored flag for *authuri*, or None if never recorded."""
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uri, flag in self.authenticated.items():
                if self.is_suburi(uri, reduced):
                    return flag
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic authentication (server and proxy)."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'       # start of the string or ','
                    '[ \t]*'        # optional whitespaces
                    '([^ \t,]+)'    # scheme like "Basic"
                    '[ \t]+'        # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            # No recognizable challenge in the header; yield the bare
            # scheme (or '') so the caller can report it as unsupported.
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (scheme,))

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with Basic credentials, or return None if none."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                # The same credentials were already rejected once; give up
                # rather than loop retrying them.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        # Preemptive auth only applies when the password manager tracks
        # is_authenticated state (HTTPPasswordMgrWithPriorAuth).
        if (not hasattr(self.passwd, 'is_authenticated') or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        # Keep the prior-auth table in sync with the request outcome.
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with HTTP Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Delegate to the shared Basic-auth machinery; the full request
        # URL doubles as the authority used for the password lookup.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with proxy Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
# Return n random bytes (used below when generating a digest-auth cnonce).
_randombytes = os.urandom
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication (RFC 2617).

    Mixed into HTTPDigestAuthHandler and ProxyDigestAuthHandler, which
    supply the concrete ``auth_header`` name.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count / last_nonce implement the RFC 2617 nc counter,
        # which restarts at 1 whenever the server issues a new nonce.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical credentials were already refused; give up
                # instead of looping.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for *req* from the
        parsed challenge *chal*, or return None when the challenge is
        unusable or no credentials are known."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per RFC 2617, when server sends "auth,auth-int", the
        # client could use either `auth` or `auth-int` to the response back.
        # we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in (token.strip() for token in qop.split(',')):
            # BUGFIX: strip whitespace around each qop token.  A server
            # sending qop="auth-int, auth" previously produced the token
            # ' auth', so 'auth' was not recognized and a URLError was
            # raised even though auth was offered.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            # qop being truthy implies the 'auth' branch above ran, so
            # ncvalue and cnonce are bound.
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The authority for the credential lookup is the netloc component
        # of the request URL.
        authority = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           authority, req, headers)
        self.reset_retry_count()
        return retry
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For a proxy challenge the authority is simply the request host.
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.host, req, headers)
        self.reset_retry_count()
        return retry
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler and HTTPSHandler.

    do_request_() normalizes a Request before sending; do_open() performs
    the exchange with the connection class supplied by the subclass.
    """

    def __init__(self, debuglevel=0):
        # Forwarded to the underlying HTTPConnection in do_open().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client's helper, which understands bytes,
        # file-like objects and iterables of bytes.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        # Validate the host and fill in the default headers: Content-type
        # and Content-length (or Transfer-encoding: chunked) for POST
        # bodies, the Host header, and the opener-wide addheaders.
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Unknown body length: fall back to chunked encoding.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxying, the selector is an absolute URL: the Host
            # header must name the origin server, not the proxy.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunnelling through a proxy (typically for HTTPS).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
class HTTPHandler(AbstractHTTPHandler):
    """Opener handler for plain-text HTTP URLs."""

    def http_open(self, req):
        # All the heavy lifting happens in AbstractHTTPHandler.do_open().
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Opener handler for HTTPS URLs; only defined when the
        interpreter was built with SSL support."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            # Forward the TLS configuration to the HTTPSConnection.
            return self.do_open(http.client.HTTPSConnection, req,
                                context=self._context,
                                check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest Set-Cookie
    headers from responses, using an http.cookiejar.CookieJar."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Write the jar's matching cookies into the request headers.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Store any cookies the response carries.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
class UnknownHandler(BaseHandler):
    """Fallback handler: any scheme nobody else claims is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip one level of surrounding double quotes.  Slicing instead
        # of indexing keeps an empty value (e.g. "token=") from raising
        # IndexError; for non-empty values the behavior is unchanged.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = ''
    escaped = False
    in_quotes = False

    for ch in s:
        if escaped:
            # Character following a backslash inside a quoted string is
            # taken literally (the backslash itself is dropped).
            current += ch
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current += ch
        elif ch == ',':
            # Top-level comma: finish the current element.
            items.append(current)
            current = ''
        else:
            if ch == '"':
                in_quotes = True
            current += ch

    if current:
        items.append(current)

    return [item.strip() for item in items]
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL whose host (if any) refers to this machine.

        A non-local host that is not one of this machine's addresses
        raises URLError.  NOTE(review): when the host IS in get_names()
        the method falls through and returns None — presumably so another
        handler can take over; confirm before changing.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Lazily compute, and cache on the class, the tuple of IP
        # addresses that refer to this machine.
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file, with synthesized
        Content-type/Content-length/Last-modified headers."""
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = _splitport(host)
            # Accept the file when there is no host, or when the host
            # (without an explicit port) resolves to one of our addresses.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1523def _safe_gethostbyname(host):
1524 try:
1525 return socket.gethostbyname(host)
1526 except socket.gaierror:
1527 return None
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Open an ftp:// URL: resolve the host, log in with credentials
        from the URL's userinfo (if any), and retrieve the file or
        directory listing named by the selector."""
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' (binary) transfer for a file, 'D' (directory listing)
            # otherwise; a ;type= attribute on the URL overrides this.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot (non-persistent) connection; CacheFTPHandler overrides
        # this to reuse connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps a bounded, per-key cache of connections.

    Connections idle longer than ``delay`` seconds are closed; at most
    ``max_conns`` connections are kept.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the idle timeout (in seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of simultaneously cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # BUGFIX: min() over an empty sequence raises ValueError; when
            # every cached connection just expired there is nothing left
            # to wait for, so fall back to 0.
            self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
class DataHandler(BaseHandler):
    """Handler for data: URLs as specified in RFC 2397."""

    def data_open(self, req):
        # syntax:
        #   dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        #   mediatype := [ type "/" subtype ] *( ";" parameter )
        #   data      := *urlchar
        #   parameter := attribute "=" value
        # POSTed data, if any, is ignored.
        url = req.full_url
        scheme, rest = url.split(":", 1)
        mediatype, rest = rest.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(rest)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-7]

        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        headers = email.message_from_string(
            "Content-type: %s\nContent-length: %d\n" %
            (mediatype, len(payload)))

        return addinfourl(io.BytesIO(payload), headers, url)
# Code moved from the old urllib module
MAXFTPCACHE = 10     # Trim the ftp cache beyond this size
# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # On POSIX a file-URL path only needs percent-decoding.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # Inverse of url2pathname: percent-encode unsafe characters.
        return quote(pathname)
# Module-wide cache of FTP connections; installed as the default
# per-instance ftpcache by URLopener.__init__ (see below).
ftpcache = {}
1693class URLopener:
1694 """Class to open URLs.
1695 This is a class rather than just a subroutine because we may need
1696 more than one set of global protocol-specific options.
1697 Note -- this is a base class for those who don't want the
1698 automatic handling of errors type 302 (relocated) and 401
1699 (authorization needed)."""
1701 __tempfiles = None
1703 version = "Python-urllib/%s" % __version__
1705 # Constructor
    def __init__(self, proxies=None, **x509):
        # This whole class is legacy API: steer callers to urlopen() and
        # friends.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry 'key_file'/'cert_file' for HTTPS client auth.
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1732 def __del__(self):
1733 self.close()
1735 def close(self):
1736 self.cleanup()
1738 def cleanup(self):
1739 # This code sometimes runs when the rest of this module
1740 # has already been deleted, so it can't use any globals
1741 # or import anything.
1742 if self.__tempfiles:
1743 for file in self.__tempfiles:
1744 try:
1745 self.__unlink(file)
1746 except OSError:
1747 pass
1748 del self.__tempfiles[:]
1749 if self.tempcache:
1750 self.tempcache.clear()
1752 def addheader(self, *args):
1753 """Add a header to be used by the HTTP interface only
1754 e.g. u.addheader('Accept', 'sound/basic')"""
1755 self.addheaders.append(args)
1757 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            # Serve a previously retrieved copy from the temp-file cache.
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            # Route through the configured proxy; open_*() receives a
            # (host, fullurl) pair instead of a plain selector.
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        # open_local_file is deliberately excluded from dynamic dispatch;
        # unknown and local schemes go through open_unknown*() instead.
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])
1794 def open_unknown(self, fullurl, data=None):
1795 """Overridable interface to open unknown URL type."""
1796 type, url = _splittype(fullurl)
1797 raise OSError('url error', 'unknown url type', type)
1799 def open_unknown_proxy(self, proxy, fullurl, data=None):
1800 """Overridable interface to open unknown URL type."""
1801 type, url = _splittype(fullurl)
1802 raise OSError('url error', 'invalid proxy for %s' % type, proxy)
1804 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # A plain local file needs no copy: hand back its own path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a named temporary file
                # whose suffix matches the URL path's extension.
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    # Initial callback before any data arrives.
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1868 # Each method named open_<type> knows how to open that type of URL
    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """
        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: split userinfo credentials out of the host.
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is a (proxyhost, full-url) pair set up
            # by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    # Host is exempted from proxying: talk to it directly.
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
1962 def open_http(self, url, data=None):
1963 """Use HTTP protocol."""
1964 return self._open_generic_http(http.client.HTTPConnection, url, data)
1966 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1967 """Handle http errors.
1969 Derived class can override this, or provide specific handlers
1970 named http_error_DDD where DDD is the 3-digit error code."""
1971 # First check if there's a specific handler for this error
1972 name = 'http_error_%d' % errcode
1973 if hasattr(self, name):
1974 method = getattr(self, name)
1975 if data is None:
1976 result = method(url, fp, errcode, errmsg, headers)
1977 else:
1978 result = method(url, fp, errcode, errmsg, headers, data)
1979 if result: return result
1980 return self.http_error_default(url, fp, errcode, errmsg, headers)
1982 def http_error_default(self, url, fp, errcode, errmsg, headers):
1983 """Default error handler: close the connection and raise OSError."""
1984 fp.close()
1985 raise HTTPError(url, errcode, errmsg, headers, None)
    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory handed to _open_generic_http(); carries
            # the opener's client certificate configuration from __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
def open_file(self, url):
    """Use local file or FTP depending on form of URL."""
    if not isinstance(url, str):
        raise URLError('file error: proxy support for file protocol currently not implemented')
    # Reject file://host/... forms unless the host part is empty or
    # the literal 'localhost'; anything else names a remote machine.
    names_remote_host = (url[:2] == '//'
                         and url[2:3] != '/'
                         and url[2:12].lower() != 'localhost/')
    if names_remote_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
def open_local_file(self, url):
    """Use local file.

    Serves the file only when the URL has no host part, or when the
    host resolves to this machine (and carries no port); otherwise
    raises URLError.  Content-Type, Content-Length and Last-modified
    headers are synthesized from the file's stat information.
    """
    # Imported lazily: only needed when a file: URL is actually opened.
    import email.utils
    import mimetypes
    host, file = _splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if not host:
        # No host part: plain local path; normalize absolute paths to a
        # full file:// URL for the returned object.
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    host, port = _splitport(host)
    # Explicit host: accept it only when no port was given and the name
    # resolves to one of this machine's own addresses.
    if (not port
        and socket.gethostbyname(host) in ((localhost(),) + thishost())):
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        elif file[:2] == './':
            raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise URLError('local file error: not on local host')
def open_ftp(self, url):
    """Use FTP protocol.

    Open connections are cached in self.ftpcache keyed by
    (user, host, port, directory) so repeated fetches can reuse the
    same login.  Returns an addinfourl wrapping the data stream.
    """
    if not isinstance(url, str):
        raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
    import mimetypes
    # Break the URL into host, credentials, port, path and ;attrs.
    host, path = _splithost(url)
    if not host: raise URLError('ftp error: no host given')
    host, port = _splitport(host)
    user, host = _splituser(host)
    if user: user, passwd = _splitpasswd(user)
    else: passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = _splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    # A leading empty component means the path was absolute.
    if dirs and not dirs[0]: dirs = dirs[1:]
    if dirs and not dirs[0]: dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily: drop every entry except
        # the one we are about to use.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # Default transfer type: directory listing when no file named,
        # binary ('I') otherwise; a ";type=x" URL attribute overrides.
        if not file: type = 'D'
        else: type = 'I'
        for attr in attrs:
            attr, value = _splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        # retrlen can be None when the server did not report a size.
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as exp:
        raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])
def open_data(self, url, data=None):
    """Use "data" URL."""
    if not isinstance(url, str):
        raise URLError('data error: proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        type, data = url.split(',', 1)
    except ValueError:
        raise OSError('data error', 'bad data URL')
    if not type:
        type = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without '=' names a transfer encoding rather
    # than a mediatype parameter.
    semi = type.rfind(';')
    if semi >= 0 and '=' not in type[semi:]:
        encoding = type[semi+1:]
        type = type[:semi]
    else:
        encoding = ''
    if encoding == 'base64':
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(data.encode('ascii')).decode('latin-1')
    else:
        payload = unquote(data)
    lines = [
        'Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                   time.gmtime(time.time())),
        'Content-type: %s' % type,
        'Content-Length: %d' % len(payload),
        '',
        payload,
    ]
    msg = '\n'.join(lines)
    headers = email.message_from_string(msg)
    f = io.StringIO(msg)
    #f.fileno = None # needed for addinfourl
    return addinfourl(f, headers, url)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Maps "realm@host" -> (user, password) from earlier prompts.
        self.auth_cache = {}
        # Consecutive-redirect counter, bounded by maxtries.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Unlike the base class, hand the error response back to the
        # caller as an ordinary response object.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Too many consecutive redirects: report a synthetic 500
            # instead of looping forever.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            # Reset the counter once this redirect chain is resolved.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a redirect from the Location (or legacy URI) header;
        # returns None when neither header is present.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each failed check below calls the BASE class handler, which
        # raises HTTPError — so control never continues past it.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth
        # based on the scheme of the original request.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but keyed on the Proxy-Authenticate
        # header; failed checks raise via the base-class handler.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding user:pass credentials
        # into the configured http proxy URL.
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip any credentials already embedded in the proxy host; i is
        # also passed to get_user_passwd so a stale cache entry is cleared.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # Same as retry_proxy_http_basic_auth, for the https proxy.
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:pass embedded in the URL itself.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # Same as retry_http_basic_auth, for https URLs.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, password) for realm@host, prompting if needed.

        A truthy clear_cache discards any cached entry first so the
        user is re-prompted.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        # Only cache non-empty credentials.
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # Treat Ctrl-C at the prompt as "no credentials".
            print()
            return None, None
2350# Utility functions
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The result of the first DNS lookup is cached in the module-level
    _localhost variable and reused on later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    Looked up once via gethostbyname_ex and cached as a tuple in the
    module-level _thishost variable.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        _, _, addresses = socket.gethostbyname_ex(socket.gethostname())
    except socket.gaierror:
        # The machine's own hostname does not resolve; fall back to
        # the addresses of 'localhost'.
        _, _, addresses = socket.gethostbyname_ex('localhost')
    _thishost = tuple(addresses)
    return _thishost
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily so FTP support costs nothing unless it
    is actually used; the result is cached for later calls.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    A single shared instance is created on first use and returned on
    every subsequent call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2389# Utility classes
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of outstanding file objects reading from this connection.
        self.refcount = 0
        # When False, the connection is torn down as soon as refcount
        # drops to zero (see file_close/close).
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Make sure a half-initialized connection is cleaned up
            # before propagating the failure.
            self.close()
            raise

    def init(self):
        """Connect, log in, and change to the target directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start retrieving *file*; return (file-like object, length).

        type 'd'/'D' requests a directory listing (ASCII mode); any
        other value is sent as an FTP TYPE command (e.g. 'I' binary).
        The returned length may be None when the server did not report
        a size.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection has likely gone stale; reconnect
            # once and retry the command.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply means RETR failed (e.g. it is a directory);
                # fall through and attempt a listing instead.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the original working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close runs when the caller closes the returned object,
        # decrementing refcount (and possibly closing the connection).
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        self.busy = 0

    def close(self):
        # Disable keepalive; tear down now only if no readers remain.
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Called via addclosehook when a returned file object is closed.
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            # Best effort: the connection may already be gone.
            pass
2485# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # Pass 1: accept any capitalization of <scheme>_proxy.
    for name, value in os.environ.items():
        lowered = name.lower()
        if value and lowered[-6:] == '_proxy':
            proxies[lowered[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client.
    # If "proxy" is lowercase, it will still be used thanks to the next pass.
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Pass 2: variables whose suffix is exactly (case-sensitively)
    # '_proxy' take precedence; an empty value removes the scheme.
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            scheme = name.lower()[:-6]
            if value:
                proxies[scheme] = value
            else:
                proxies.pop(scheme, None)
    return proxies
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return False
    # '*' is special case for always bypass
    if no_proxy == '*':
        return True
    host = host.lower()
    # strip port off host
    hostonly, port = _splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        name = name.strip()
        if not name:
            continue
        name = name.lstrip('.').lower()  # ignore leading dots
        if name in (hostonly, host):
            return True
        suffix = '.' + name
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return True
    # otherwise, don't bypass
    return False
# This code tests an OSX specific data structure but is testable on all
# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = _splitport(host)

    def ip2num(ipAddr):
        # Pack a dotted quad into one 32-bit integer; short forms like
        # '10.1' are zero-padded on the right to four components.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    # Resolved lazily below, only if a numeric exception is present.
    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric exception, optionally with a /prefix-length mask.
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except OSError:
                    # Unresolvable host cannot match an IP exception.
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit mask: infer one from the number of dotted
                # components, e.g. '10.1' -> /16.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            if mask < 0 or mask > 32:
                # System libraries ignore invalid prefix lengths
                continue

            # Compare only the network (prefix) bits of both addresses.
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False
# Select platform-appropriate implementations of getproxies() and
# proxy_bypass() at import time.
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        # Thin wrapper: fetch the live system proxy settings and defer
        # to the platform-independent _proxy_bypass_macosx_sysconf.
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        # Environment variables win over system configuration.
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('(?:[^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Returns 1/0 (not True/False) to match the historical API.
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            # Translate the registry glob into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment