1"""
2Ported using Python-Future from the Python 3.3 standard library.
3
4An extensible library for opening URLs using a variety of protocols
5
6The simplest way to use this module is to call the urlopen function,
7which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
10
11The OpenerDirector manages a collection of Handler objects that do
12all the actual work. Each Handler implements a particular protocol or
13option. The OpenerDirector is a composite object that invokes the
14Handlers needed to open the requested URL. For example, the
15HTTPHandler performs HTTP GET and POST requests and deals with
16non-error returns. The HTTPRedirectHandler automatically deals with
17HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
18deals with digest authentication.
19
20urlopen(url, data=None) -- Basic usage is the same as original
21urllib. pass the url and optionally data to post to an HTTP URL, and
22get a file-like object back. One difference is that you can also pass
23a Request instance instead of URL. Raises a URLError (subclass of
24IOError); for HTTP errors, raises an HTTPError, which can also be
25treated as a valid response.
26
27build_opener -- Function that creates a new OpenerDirector instance.
28Will install the default handlers. Accepts one or more Handlers as
29arguments, either instances or Handler classes that it will
instantiate. If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.
32
33install_opener -- Installs a new opener as the default opener.
34
35objects of interest:
36
37OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
38the Handler classes, while dealing with requests and responses.
39
40Request -- An object that encapsulates the state of a request. The
41state can be as simple as the URL. It can also include extra HTTP
42headers, e.g. a User-Agent.
43
44BaseHandler --
45
46internals:
47BaseHandler and parent
48_call_chain conventions
49
50Example usage:
51
52import urllib.request
53
54# set up authentication info
55authinfo = urllib.request.HTTPBasicAuthHandler()
56authinfo.add_password(realm='PDQ Application',
57 uri='https://mahler:8092/site-updates.py',
58 user='klem',
59 passwd='geheim$parole')
60
61proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
62
63# build a new opener that adds authentication and caching FTP handlers
64opener = urllib.request.build_opener(proxy_support, authinfo,
65 urllib.request.CacheFTPHandler)
66
67# install it
68urllib.request.install_opener(opener)
69
70f = urllib.request.urlopen('http://www.python.org/')
71"""
72
73# XXX issues:
74# If an authentication error handler that tries to perform
75# authentication for some reason but fails, how should the error be
76# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
79# pass that information along to the client, too.
80# ftp errors aren't handled cleanly
81# check digest against correct (i.e. non-apache) implementation
82
83# Possible extensions:
84# complex proxies XXX not sure what exactly was meant by this
85# abstract factory for opener
86
87from __future__ import absolute_import, division, print_function, unicode_literals
88from future.builtins import bytes, dict, filter, input, int, map, open, str
89from future.utils import PY2, PY3, raise_with_traceback
90
91import base64
92import bisect
93import hashlib
94import array
95
96from future.backports import email
97from future.backports.http import client as http_client
98from .error import URLError, HTTPError, ContentTooShortError
99from .parse import (
100 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
101 splittype, splithost, splitport, splituser, splitpasswd,
102 splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
103from .response import addinfourl, addclosehook
104
105import io
106import os
107import posixpath
108import re
109import socket
110import sys
111import time
112import tempfile
113import contextlib
114import warnings
115
116from future.utils import PY2
117
118if PY2:
119 from collections import Iterable
120else:
121 from collections.abc import Iterable
122
# check for SSL
try:
    import ssl
    # Not available in the SSL module in Py2:
    from ssl import SSLContext
except ImportError:
    # No usable SSL support; urlopen() raises ValueError if the caller
    # requests certificate checking (cafile/capath/cadefault) without it.
    _have_ssl = False
else:
    _have_ssl = True
132
# Public API of this module; mirrors Python 3.3's urllib.request.
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler',
    'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler',
    'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]
148
149# used in User-Agent header sent
150__version__ = sys.version[:3]
151
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, **_3to2kwargs):
    """Open *url* (a string or a Request) and return a file-like response.

    The keyword-only options cafile, capath and cadefault (accepted via
    **_3to2kwargs for Python 2 compatibility) enable HTTPS certificate
    verification; supplying any of them builds a dedicated verifying
    opener instead of using the installed global one.
    """
    cadefault = _3to2kwargs.pop('cadefault', False)
    capath = _3to2kwargs.pop('capath', None)
    cafile = _3to2kwargs.pop('cafile', None)
    global _opener
    if cafile or capath or cadefault:
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # One-off opener with a certificate-checking HTTPS handler.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.options |= ssl.OP_NO_SSLv2
        context.verify_mode = ssl.CERT_REQUIRED
        if cafile or capath:
            context.load_verify_locations(cafile, capath)
        else:
            context.set_default_verify_paths()
        https_handler = HTTPSHandler(context=context, check_hostname=True)
        opener = build_opener(https_handler)
    elif _opener is None:
        # First plain call: lazily build and cache the default opener.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
178
def install_opener(opener):
    """Install *opener* as the default OpenerDirector used by urlopen()."""
    global _opener
    _opener = opener
182
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument.  If a filename is passed, it is used as
    the temporary file location.  The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target.  The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # file:// URLs with no explicit destination need no copy at all:
        # hand back the local path together with the pseudo-headers.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Pick the destination: the caller's filename, or a kept-on-disk
        # NamedTemporaryFile remembered so urlcleanup() can delete it later.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            blocksize = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            # Initial callback before the first block is fetched.
            if reporthook:
                reporthook(blocknum, blocksize, size)

            # Copy in fixed-size chunks, reporting progress as we go.
            block = fp.read(blocksize)
            while block:
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, blocksize, size)
                block = fp.read(blocksize)

            # A short read w.r.t. Content-Length means a truncated transfer.
            if size >= 0 and read < size:
                raise ContentTooShortError(
                    "retrieval incomplete: got only %i out of %i bytes"
                    % (read, size), result)

    return result
246
def urlcleanup():
    """Delete temp files made by urlretrieve() and reset the default opener."""
    global _opener
    for path in _url_tempfiles:
        try:
            os.unlink(path)
        except EnvironmentError:
            # Best effort only -- the file may already be gone.
            pass
    # Empty the list in place (PY2-compatible alternative to list.clear()).
    del _url_tempfiles[:]
    if _opener:
        _opener = None
258
# Regex that matches a trailing ":port" on a host string; re.ASCII exists
# only on Python 3, so fall back to the default flags on Python 2.
if PY3:
    _cut_port_re = re.compile(r":\d+$", re.ASCII)
else:
    _cut_port_re = re.compile(r":\d+$")
263
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if host == "":
        # The URL carried no netloc; fall back to the Host header.
        host = request.get_header("Host", "")

    # Drop at most one trailing ":port" before normalising case.
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
280
class Request(object):
    """Encapsulate the state of a single URL request.

    Holds the URL (with any #fragment split off), the optional request
    body, the headers, and the proxy/tunnel bookkeeping attributes used
    by the handler machinery.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 method=None):
        """Create a request for *url*.

        data -- request body; its presence switches get_method() to POST.
        headers -- optional mapping of initial headers.  (A None default
            replaces the earlier shared mutable ``{}`` default; existing
            callers see identical behavior.)
        origin_req_host -- RFC 2965 request-host; derived from the URL
            when not given.
        unverifiable -- True for third-party requests (e.g. redirects).
        method -- explicit HTTP method overriding the GET/POST default.
        """
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.full_url = unwrap(url)
        self.full_url, self.fragment = splittag(self.full_url)
        self.data = data
        self.headers = {}
        self._tunnel_host = None
        if headers is not None:
            for key, value in headers.items():
                self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        self.method = method
        self._parse()

    def _parse(self):
        """Split full_url into .type (scheme), .host and .selector."""
        self.type, rest = splittype(self.full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        if self.method is not None:
            return self.method
        elif self.data is not None:
            return "POST"
        else:
            return "GET"

    def get_full_url(self):
        """Return the URL, re-attaching the fragment stripped at init."""
        if self.fragment:
            return '%s#%s' % (self.full_url, self.fragment)
        else:
            return self.full_url

    # Begin deprecated methods

    def add_data(self, data):
        msg = "Request.add_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        self.data = data

    def has_data(self):
        msg = "Request.has_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data is not None

    def get_data(self):
        msg = "Request.get_data method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.data

    def get_type(self):
        msg = "Request.get_type method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.type

    def get_host(self):
        msg = "Request.get_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.host

    def get_selector(self):
        msg = "Request.get_selector method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.selector

    def is_unverifiable(self):
        msg = "Request.is_unverifiable method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.unverifiable

    def get_origin_req_host(self):
        msg = "Request.get_origin_req_host method is deprecated."
        warnings.warn(msg, DeprecationWarning, stacklevel=1)
        return self.origin_req_host

    # End deprecated methods

    def set_proxy(self, host, type):
        """Route the request through the proxy *host*.

        For https the first call only records the tunnel target (the
        request itself is unchanged, pending a CONNECT tunnel); otherwise
        the request is rewritten to use the proxy directly.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        """Return True once set_proxy() rewrote the selector to the full URL."""
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if the header is set (normal or unredirected)."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return a header's value, checking unredirected headers too."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return all headers as a list of (name, value) pairs."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
401
class OpenerDirector(object):
    """Compose BaseHandler instances and route requests through them.

    Handlers register via add_handler(); each handler method's name
    (e.g. "http_open", "http_error_302", "https_response") determines
    which dispatch table it lands in and for which protocol.
    """
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler*, indexing its methods by protocol and role."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split "protocol_condition" on the first underscore, e.g.
            # "http_error_302" -> protocol "http", condition "error_302".
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "kind" is the numeric status code when possible,
                # otherwise a symbolic name (e.g. "default").
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each chain sorted by handler_order (BaseHandler.__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """
        Accept a URL or a Request object

        Python-Future: if the URL is passed as a byte-string, decode it first.
        """
        if isinstance(fullurl, bytes):
            fullurl = fullurl.decode()
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first crack, then the protocol-specific
        # ones, and finally unknown_open as a fallback.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error_* handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2] # YUCK!  (args[2] is the HTTP status code here)
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # Nothing handled the specific code; fall back to the
            # http_error_default chain with the original arguments.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
545
546# XXX probably also want an abstract factory that knows when it makes
547# sense to skip a superclass in favor of a subclass and when it might
548# make sense to include both
549
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def isclass(obj):
        # hasattr(obj, "__bases__") also catches Python 2 old-style classes.
        return isinstance(obj, type) or hasattr(obj, "__bases__")

    def supersedes(check, klass):
        # A caller-supplied handler replaces a default when it is a
        # subclass (given as class) or an instance of a subclass.
        if isclass(check):
            return issubclass(check, klass)
        return isinstance(check, klass)

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(http_client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Keep only the defaults that no caller-supplied handler supersedes.
    remaining = [klass for klass in default_classes
                 if not any(supersedes(check, klass) for check in handlers)]
    for klass in remaining:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
587
class BaseHandler(object):
    """Common base class for handlers managed by an OpenerDirector."""

    # Position within the director's sorted handler chains; lower runs first.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was registered with."""
        self.parent = parent

    def close(self):
        """No-op; only exists for backwards compatibility."""
        pass

    def __lt__(self, other):
        """Sort by handler_order; order-unaware objects sort after us."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Preserve the old behavior of having custom classes inserted
            # after default ones (works only for custom user classes which
            # are not aware of handler_order).
            return True
        return self.handler_order < other_order
605
606
class HTTPErrorProcessor(BaseHandler):
    """Turn non-2xx HTTP responses into calls on the opener's error chain."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Pass 2xx responses through; route the rest to parent.error()."""
        code = response.code
        msg = response.msg
        hdrs = response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
623
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort error handler: raise the error response as HTTPError."""
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
627
class HTTPRedirectHandler(BaseHandler):
    """Follow 301/302/303/307 responses by re-issuing the request."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # GET/HEAD may follow any of the four codes; POST only 301/302/303
        # (the new Request below has no data, so it becomes a GET).
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        # Drop body-describing headers: the redirected request has no body.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # An empty path (e.g. "http://host") is normalised to "/".
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # Resolve relative redirect targets against the original URL.
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
730
731
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, remainder = splittype(proxy)
    if remainder.startswith("/"):
        # URL form: RFC 3986 (ss 3.2 and 3.3) requires '//' before the
        # authority, after which the path is empty or starts with '/'.
        if not remainder.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        end = remainder.find("/", 2)
        authority = remainder[2:end if end != -1 else None]
    else:
        # Bare authority such as 'host:port' -- no scheme applies.
        scheme = None
        authority = proxy
    userinfo, hostport = splituser(authority)
    user = password = None
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
803
class ProxyHandler(BaseHandler):
    """Route requests through proxies given as a {scheme: proxy_url} map."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Dynamically grow one "<scheme>_open" method per configured proxy.
        # The lambda's default arguments bind url/type *at loop time*,
        # sidestepping the late-binding closure pitfall.
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go via *proxy*, adding auth when configured."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Hosts matched by the platform's no-proxy rules connect directly.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
845
class HTTPPasswordMgr(object):
    """Store (user, password) pairs keyed by realm and reduced URI."""

    def __init__(self):
        # Maps realm -> {(reduced_uri, ...): (user, passwd)}.
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        realm_map = self.passwd.setdefault(realm, {})
        # Index under both the default-port-qualified and the raw form so
        # lookups succeed whether or not the port is spelled out.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uris)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) matching *authuri*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                if any(self.is_suburi(uri, reduced_authuri) for uri in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # Full URI: scheme plus authority, defaulting the path to '/'.
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # Bare host or host:port.
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
908
909
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard (None) realm."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; if nothing matches, retry with realm None."""
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
918
919
class AbstractBasicAuthHandler(object):
    """Shared machinery for HTTP and proxy Basic authentication."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        # Default to a plain HTTPPasswordMgr when the caller supplies none.
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.retried = 0

    def reset_retry_count(self):
        """Reset the retry counter (called after each auth attempt cycle)."""
        self.retried = 0

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge header named *authreq* and retry with creds."""
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if self.retried > 5:
            # retry sending the username:password 5 times before failing.
            raise HTTPError(req.get_full_url(), 401, "basic auth failed",
                            headers, None)
        else:
            self.retried += 1

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                # Extract the (possibly quoted) realm from the challenge.
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    if quote not in ['"',"'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        response = self.retry_http_basic_auth(host, req, realm)
                        # A non-401 reply means the credentials worked.
                        if response and response.code != 401:
                            self.retried = 0
                        return response

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with an Authorization header, or return None."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # If we already sent exactly this header, don't loop forever.
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
987
988
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with HTTP Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full URL doubles as the lookup key for the password manager.
        response = self.http_error_auth_reqed('www-authenticate',
                                              req.full_url, req, headers)
        self.reset_retry_count()
        return response
999
1000
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with Proxy-Authorization."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo
        # component in authority.  Assume there isn't one, since
        # urllib.request does not (and should not, RFC 3986 s. 3.2.1)
        # support requests for URLs containing userinfo.
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1015
1016
# Return n random bytes (used below for the digest-auth client nonce).
_randombytes = os.urandom
1019
1020
class AbstractDigestAuthHandler(object):
    """Shared implementation of HTTP digest authentication (RFC 2617).

    Mixed into HTTPDigestAuthHandler / ProxyDigestAuthHandler, which
    supply ``auth_header`` and wire this up to 401/407 responses.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        """*passwd* is an HTTPPasswordMgr-compatible password manager;
        a fresh one is created when omitted."""
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count / last_nonce implement the per-nonce request
        # counter ("nc") required when the server asks for qop="auth"
        # (RFC 2617 section 3.2.2).
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        # Called by the http_error_40x entry points after each request
        # cycle so the retry budget starts fresh next time.
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Parse the challenge found in *auth_header* and retry the
        request with Digest credentials; raise HTTPError after 5 tries."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                # 'basic' challenges are silently ignored (presumably so
                # a Basic handler elsewhere in the chain can act on them).
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* once with an Authorization header built from
        the server challenge *auth*; None if the identical header was
        already sent (avoids an infinite retry loop)."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the value of a Digest Authorization header for *req*
        from the parsed challenge *chal*.

        Returns None when the challenge is malformed, the algorithm is
        unsupported, or no credentials are known for the realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            # Unsupported algorithm: we cannot answer this challenge.
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # Reusing a server nonce must bump the nc counter; a fresh
            # nonce resets it (RFC 2617 section 3.2.2).
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest primitives for *algorithm*, with
        H = None when the algorithm is not supported."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        else:
            # BUGFIX: H was previously left unbound here, so an
            # unsupported algorithm raised UnboundLocalError at the
            # return below instead of letting get_authorization() take
            # its "H is None" bail-out path.
            H = None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1160
1161
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The netloc of the URL identifies the password-manager entry.
        netloc = urlparse(req.full_url).netloc
        response = self.http_error_auth_reqed('www-authenticate',
                                              netloc, req, headers)
        self.reset_retry_count()
        return response
1178
1179
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 407 responses by retrying with Proxy-Authorization digests."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For proxies the request host itself keys the credentials.
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1191
class AbstractHTTPHandler(BaseHandler):
    """Common machinery for HTTPHandler/HTTPSHandler: request fix-up
    (do_request_) and the actual network exchange (do_open)."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        # NOTE(review): _debuglevel is stored but never forwarded to the
        # connection object in do_open() — looks like an omission; confirm.
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in required headers (Content-type/length for POSTs, Host,
        and the opener's addheaders) before the request is sent.

        Returns the (mutated) request; raises URLError when no host is
        present and TypeError when POST data is a str.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes or an iterable of bytes. " \
                      "It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                size = None
                try:
                    ### For Python-Future:
                    if PY2 and isinstance(data, array.array):
                        # memoryviews of arrays aren't supported
                        # in Py2.7. (e.g. memoryview(array.array('I',
                        # [1, 2, 3, 4])) raises a TypeError.)
                        # So we calculate the size manually instead:
                        size = len(data) * data.itemsize
                    ###
                    else:
                        mv = memoryview(data)
                        size = len(mv) * mv.itemsize
                except TypeError:
                    if isinstance(data, Iterable):
                        raise ValueError("Content-Length should be specified "
                                "for iterable data of type %r %r" % (type(data),
                                data))
                else:
                    request.add_unredirected_header(
                            'Content-length', '%d' % size)

        sel_host = host
        if request.has_proxy():
            # For proxied requests the Host header must name the origin
            # server taken from the selector, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)

        # Unredirected headers take precedence over the normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. 'content-type' ->
        # 'Content-Type') so duplicates cannot sneak in.
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            # CONNECT tunnelling (https through a proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.selector, req.data, headers)
        except socket.error as err:  # timeout error
            h.close()
            raise URLError(err)
        else:
            r = h.getresponse()
            # If the server does not send us a 'Connection: close' header,
            # HTTPConnection assumes the socket should be left open. Manually
            # mark the socket to be closed when this response object goes away.
            if h.sock:
                h.sock.close()
                h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg. It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1312
1313
class HTTPHandler(AbstractHTTPHandler):
    """Opener handler for plain http:// URLs."""

    def http_open(self, req):
        # Delegate the whole exchange to AbstractHTTPHandler.do_open,
        # telling it which connection class to instantiate.
        connection_class = http_client.HTTPConnection
        return self.do_open(connection_class, req)

    http_request = AbstractHTTPHandler.do_request_
1320
# HTTPSHandler exists only when the http_client backport was built with
# SSL support available.
if hasattr(http_client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        # Mirrors HTTPHandler but threads the optional SSL context and
        # check_hostname flag through to HTTPSConnection.

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http_client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1337
class HTTPCookieProcessor(BaseHandler):
    """Attach stored cookies to requests and harvest Set-Cookie headers
    from responses into a CookieJar."""

    def __init__(self, cookiejar=None):
        import future.backports.http.cookiejar as http_cookiejar
        self.cookiejar = (http_cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Add any matching cookies we hold to the outgoing request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record cookies the server set so later requests can send them.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1355
class UnknownHandler(BaseHandler):
    """Fallback handler: any scheme nobody else claims is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1360
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    def _unwrap(value):
        # Strip one layer of surrounding double quotes, if present.
        if value[0] == '"' and value[-1] == '"':
            value = value[1:-1]
        return value

    pairs = (item.split('=', 1) for item in l)
    return dict((key, _unwrap(value)) for key, value in pairs)
1370
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = ''
    in_escape = False
    in_quotes = False

    for ch in s:
        if in_escape:
            # Previous character was a backslash inside quotes: the
            # backslash is dropped and this character taken literally.
            current += ch
            in_escape = False
        elif in_quotes:
            if ch == '\\':
                in_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                current += ch
        elif ch == ',':
            # An unquoted comma terminates the current element.
            items.append(current)
            current = ''
        else:
            if ch == '"':
                in_quotes = True
            current += ch

    # Keep any trailing element that has no terminating comma.
    if current:
        items.append(current)

    return [item.strip() for item in items]
1413
class FileHandler(BaseHandler):
    """Open file:// URLs that refer to this machine."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            # BUGFIX: was "not req.host is self.get_names()" — an identity
            # test of a string against a tuple, which is always true and
            # so raised for every non-localhost authority.  A membership
            # test against this machine's names is what is intended.
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost, resolved once and cached on the class
    names = None
    def get_names(self):
        """Return a tuple of IP addresses that count as 'this host'."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Stat and open the local file named by *req*, returning an
        addinfourl with synthesized Content-type/length headers."""
        import future.backports.email.utils as email_utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email_utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Serve only when there is no authority, or the (portless)
            # authority resolves to one of this machine's addresses.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')
1465
1466def _safe_gethostbyname(host):
1467 try:
1468 return socket.gethostbyname(host)
1469 except socket.gaierror:
1470 return None
1471
class FTPHandler(BaseHandler):
    """Open ftp:// URLs using a fresh, non-persistent FTP connection."""

    def ftp_open(self, req):
        """Fetch a file (or directory listing) over FTP and return it
        wrapped in an addinfourl; raises URLError on ftplib failures."""
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Binary ('I') transfer for files, directory listing ('D')
            # otherwise; a ";type=" URL attribute may override this.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise_with_traceback(exc)

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot connection; CacheFTPHandler overrides this to reuse
        # connections across requests.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1529
class CacheFTPHandler(FTPHandler):
    """FTPHandler variant keeping a bounded, time-limited cache of open
    FTP connections keyed by (user, host, port, path, timeout)."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}    # key -> live ftpwrapper connection
        self.timeout = {}  # key -> absolute expiry time
        self.soonest = 0   # earliest expiry among cached connections
        self.delay = 60    # seconds a connection stays cached
        self.max_conns = 16

    def setTimeout(self, t):
        # How long (seconds) cached connections are kept alive.
        self.delay = t

    def setMaxConns(self, m):
        # Upper bound on cache size, enforced in check_cache().
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this key, creating one if
        needed; the expiry time is refreshed either way."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce max_conns by dropping
        the entry closest to expiry."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # NOTE(review): min() raises ValueError if every entry just
            # expired; in practice connect_ftp() adds a fresh entry
            # before calling here — confirm no other caller exists.
            self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # NOTE(review): this eviction path does not close()
                    # the dropped connection — confirm intentional.
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

    def clear_cache(self):
        """Close every cached connection and empty both tables."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1582
1583
# Code move from the old urllib module

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems: Windows gets path conversions from
# nturl2path; everywhere else a URL path and a filesystem path differ
# only by percent-quoting.
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
1601
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shared module-level FTP connection cache; URLopener instances use it
# by default but may be given a private one (see URLopener.__init__).
ftpcache = {}
class URLopener(object):
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Temp files created by retrieve(); the class-level None keeps
    # cleanup() safe on instances whose __init__ never completed.
    __tempfiles = None

    # Default User-Agent sent with every request.
    version = "Python-urllib/%s" % __version__
1621
    # Constructor
    def __init__(self, proxies=None, **x509):
        """Set up proxies (defaulting to the environment's, via
        getproxies()) and optional x509 keywords key_file/cert_file
        used for HTTPS client authentication."""
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1648
    def __del__(self):
        # Make sure temp files are removed when the opener is collected.
        self.close()
1651
    def close(self):
        """Release resources: delete temporary files via cleanup()."""
        self.cleanup()
1654
    def cleanup(self):
        """Best-effort removal of temp files created by retrieve()."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    # self.__unlink is os.unlink, captured in __init__
                    # precisely because globals may be gone by now.
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
1668
1669 def addheader(self, *args):
1670 """Add a header to be used by the HTTP interface only
1671 e.g. u.addheader('Accept', 'sound/basic')"""
1672 self.addheaders.append(args)
1673
    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        # Percent-quote unsafe characters while leaving URL syntax intact.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the open_<scheme> method, if any.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except HTTPError:
            raise
        except socket.error as msg:
            raise_with_traceback(IOError('socket error', msg))
1710
1711 def open_unknown(self, fullurl, data=None):
1712 """Overridable interface to open unknown URL type."""
1713 type, url = splittype(fullurl)
1714 raise IOError('url error', 'unknown url type', type)
1715
1716 def open_unknown_proxy(self, proxy, fullurl, data=None):
1717 """Overridable interface to open unknown URL type."""
1718 type, url = splittype(fullurl)
1719 raise IOError('url error', 'invalid proxy for %s' % type, proxy)
1720
    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            # Local files can be answered directly, without copying.
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError as msg:
                # Fall through to the generic (copying) path below.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target name given: copy into a mkstemp file whose
                # suffix mirrors the URL's path extension.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    # reporthook(blocks transferred, block size, total size)
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1785
    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Tuple form (proxyhost, full-url) set up by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise IOError('http error', 'no host given')

        # user:password strings become base64 Basic credentials.
        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http_client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
1879
1880 def open_http(self, url, data=None):
1881 """Use HTTP protocol."""
1882 return self._open_generic_http(http_client.HTTPConnection, url, data)
1883
1884 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1885 """Handle http errors.
1886
1887 Derived class can override this, or provide specific handlers
1888 named http_error_DDD where DDD is the 3-digit error code."""
1889 # First check if there's a specific handler for this error
1890 name = 'http_error_%d' % errcode
1891 if hasattr(self, name):
1892 method = getattr(self, name)
1893 if data is None:
1894 result = method(url, fp, errcode, errmsg, headers)
1895 else:
1896 result = method(url, fp, errcode, errmsg, headers, data)
1897 if result: return result
1898 return self.http_error_default(url, fp, errcode, errmsg, headers)
1899
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)
1904
    # HTTPS support only exists when the ssl module is available.
    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory handed to _open_generic_http(); carries
            # the client certificate configured in __init__.
            return http_client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
1914
1915 def open_file(self, url):
1916 """Use local file or FTP depending on form of URL."""
1917 if not isinstance(url, str):
1918 raise URLError('file error: proxy support for file protocol currently not implemented')
1919 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
1920 raise ValueError("file:// scheme is supported only on localhost")
1921 else:
1922 return self.open_local_file(url)
1923
    def open_local_file(self, url):
        """Use local file."""
        import future.backports.email.utils as email_utils
        import mimetypes
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email_utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            # No authority component: plain local path.
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A portless authority resolving to this machine also counts
        # as local.
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')
1955
1956 def open_ftp(self, url):
1957 """Use FTP protocol."""
1958 if not isinstance(url, str):
1959 raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
1960 import mimetypes
1961 host, path = splithost(url)
1962 if not host: raise URLError('ftp error: no host given')
1963 host, port = splitport(host)
1964 user, host = splituser(host)
1965 if user: user, passwd = splitpasswd(user)
1966 else: passwd = None
1967 host = unquote(host)
1968 user = unquote(user or '')
1969 passwd = unquote(passwd or '')
1970 host = socket.gethostbyname(host)
1971 if not port:
1972 import ftplib
1973 port = ftplib.FTP_PORT
1974 else:
1975 port = int(port)
1976 path, attrs = splitattr(path)
1977 path = unquote(path)
1978 dirs = path.split('/')
1979 dirs, file = dirs[:-1], dirs[-1]
1980 if dirs and not dirs[0]: dirs = dirs[1:]
1981 if dirs and not dirs[0]: dirs[0] = '/'
1982 key = user, host, port, '/'.join(dirs)
1983 # XXX thread unsafe!
1984 if len(self.ftpcache) > MAXFTPCACHE:
1985 # Prune the cache, rather arbitrarily
1986 for k in self.ftpcache.keys():
1987 if k != key:
1988 v = self.ftpcache[k]
1989 del self.ftpcache[k]
1990 v.close()
1991 try:
1992 if key not in self.ftpcache:
1993 self.ftpcache[key] = \
1994 ftpwrapper(user, passwd, host, port, dirs)
1995 if not file: type = 'D'
1996 else: type = 'I'
1997 for attr in attrs:
1998 attr, value = splitvalue(attr)
1999 if attr.lower() == 'type' and \
2000 value in ('a', 'A', 'i', 'I', 'd', 'D'):
2001 type = value.upper()
2002 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
2003 mtype = mimetypes.guess_type("ftp:" + url)[0]
2004 headers = ""
2005 if mtype:
2006 headers += "Content-Type: %s\n" % mtype
2007 if retrlen is not None and retrlen >= 0:
2008 headers += "Content-Length: %d\n" % retrlen
2009 headers = email.message_from_string(headers)
2010 return addinfourl(fp, headers, "ftp:" + url)
2011 except ftperrors() as exp:
2012 raise_with_traceback(URLError('ftp error %r' % exp))
2013
2014 def open_data(self, url, data=None):
2015 """Use "data" URL."""
2016 if not isinstance(url, str):
2017 raise URLError('data error: proxy support for data protocol currently not implemented')
2018 # ignore POSTed data
2019 #
2020 # syntax of data URLs:
2021 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
2022 # mediatype := [ type "/" subtype ] *( ";" parameter )
2023 # data := *urlchar
2024 # parameter := attribute "=" value
2025 try:
2026 [type, data] = url.split(',', 1)
2027 except ValueError:
2028 raise IOError('data error', 'bad data URL')
2029 if not type:
2030 type = 'text/plain;charset=US-ASCII'
2031 semi = type.rfind(';')
2032 if semi >= 0 and '=' not in type[semi:]:
2033 encoding = type[semi+1:]
2034 type = type[:semi]
2035 else:
2036 encoding = ''
2037 msg = []
2038 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
2039 time.gmtime(time.time())))
2040 msg.append('Content-type: %s' % type)
2041 if encoding == 'base64':
2042 # XXX is this encoding/decoding ok?
2043 data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
2044 else:
2045 data = unquote(data)
2046 msg.append('Content-Length: %d' % len(data))
2047 msg.append('')
2048 msg.append(data)
2049 msg = '\n'.join(msg)
2050 headers = email.message_from_string(msg)
2051 f = io.StringIO(msg)
2052 #f.fileno = None # needed for addinfourl
2053 return addinfourl(f, headers, url)
2054
2055
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Remembers (user, passwd) keyed by "realm@host"; see get_user_passwd().
        self.auth_cache = {}
        # Redirect-loop protection: tries counts redirects handled so far,
        # maxtries caps them before a synthetic 500 is reported.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many consecutive redirects: report a synthetic 500
            # instead of recursing forever.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Follow the target named in the Location (or URI) header."""
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            # No redirect target supplied: give up (returns None).
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): on the failure paths below the http_error_default
        # result is not returned and execution falls through; this mirrors
        # the upstream stdlib code -- confirm before changing.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # Dispatch to retry_proxy_http_basic_auth / retry_proxy_https_basic_auth.
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-open url after embedding fresh credentials in the HTTP proxy URL."""
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already present in the proxy host part.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Re-open url after embedding fresh credentials in the HTTPS proxy URL."""
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already present in the proxy host part.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Re-open url over HTTP with credentials inlined in the netloc."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Re-open url over HTTPS with credentials inlined in the netloc."""
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm@host, prompting and caching."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                # Previous credentials were rejected; forget them and re-prompt.
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2264
2265
2266# Utility functions
2267
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The address is resolved once and cached in a module-level variable.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2275
_thishost = None
def thishost():
    """Return the IP addresses of the current host (cached after first call)."""
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addrs = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        # Hostname does not resolve; fall back to the loopback entry.
        addrs = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addrs)
    return _thishost
2286
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class (lazily imported)."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2295
_noheaders = None
def noheaders():
    """Return an empty email Message object (shared, cached instance)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2303
2304
2305# Utility classes
2306
class ftpwrapper(object):
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Count of file objects handed out by retrfile() and not yet closed;
        # the connection is torn down only once it drops to zero.
        self.refcount = 0
        # When True the control connection stays open after the last file
        # object is closed (see file_close()/close()).
        self.keepalive = persistent
        self.init()

    def init(self):
        # (Re)connect, authenticate, and change to the directory given at
        # construction time.
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start retrieving *file* (or a directory listing) and return a
        (file-like object, length-or-None) pair."""
        import ftplib
        self.endtransfer()
        # Directory requests ('d'/'D') use ASCII mode; otherwise the
        # caller-supplied transfer type is used verbatim.
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Control connection likely dropped; reconnect once and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # Anything but 550 is fatal; 550 falls through to try a
                # directory listing instead.
                if str(reason)[:3] != '550':
                    raise_with_traceback(URLError('ftp error: %r' % reason))
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        ### Was:
                        # raise URLError('ftp error: %r' % reason) from reason
                        exc = URLError('ftp error: %r' % reason)
                        exc.__cause__ = reason
                        raise exc
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        self.busy = 0

    def close(self):
        # Disable keepalive; close immediately only if no readers remain.
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Hook invoked when a file object from retrfile() is closed.
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
2399
2400# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scans the environment for variables named <scheme>_proxy (any case);
    this seems to be the standard convention.  If you need a different
    way, you can pass a proxies dictionary to the [Fancy]URLopener
    constructor.
    """
    return {
        name.lower()[:-6]: value
        for name, value in os.environ.items()
        if value and name.lower().endswith('_proxy')
    }
2416
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    Returns 1 to bypass the proxy, 0 otherwise.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    if no_proxy == '*':
        # '*' is the special "always bypass" marker.
        return 1
    # Compare against the host both with and without its port.
    hostonly, port = splitport(host)
    suffixes = (entry.strip() for entry in no_proxy.split(','))
    if any(s and (hostonly.endswith(s) or host.endswith(s)) for s in suffixes):
        return 1
    # otherwise, don't bypass
    return 0
2436
2437
2438# This code tests an OSX specific data structure but is testable on all
2439# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack a (possibly partial) dotted-quad string into a 32-bit int;
        # missing trailing components are padded with zeros.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    # Numeric form of *host*, resolved lazily the first time a numeric
    # exception entry needs it.
    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric entry, optionally with a /prefix-length mask.
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except socket.error:
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit mask: each written component counts as 8 bits.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])
            # Convert prefix length to a right-shift amount for comparison.
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False
2498
2499
if sys.platform == 'darwin':
    # OS X: proxy configuration comes from SystemConfiguration, exposed by
    # the private _scproxy extension module.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        # Environment variables take precedence over system configuration.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Consult the ProxyOverride registry value to decide whether *host*
        # should bypass the proxy.
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means: bypass for plain (dot-less) host names.
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment