Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/urllib/request.py: 15%
1646 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:05 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:05 +0000
1"""An extensible library for opening URLs using a variety of protocols
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below). It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
27instantiate. If one of the argument is a subclass of the default
28handler, the argument will be installed instead of the default.
30install_opener -- Installs a new opener as the default opener.
32objects of interest:
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
41BaseHandler --
43internals:
44BaseHandler and parent
45_call_chain conventions
47Example usage:
49import urllib.request
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54 uri='https://mahler:8092/site-updates.py',
55 user='klem',
56 passwd='geheim$parole')
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62 urllib.request.CacheFTPHandler)
64# install it
65urllib.request.install_opener(opener)
67f = urllib.request.urlopen('http://www.python.org/')
68"""
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm that was requested in the challenge, it would be
# good to pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
80# Possible extensions:
81# complex proxies XXX not sure what exactly was meant by this
82# abstract factory for opener
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104 urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105 _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106 _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107 unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
# check for SSL
try:
    import ssl
except ImportError:
    # Remember the outcome so urlopen() can reject cafile/capath/cadefault
    # when HTTPS support is unavailable.
    _have_ssl = False
else:
    _have_ssl = True

__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# Module-level cache of the default opener; created lazily by urlopen()
# and replaceable via install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a
    context manager and has the properties url, headers, and status.
    See urllib.response.addinfourl for more detail on these properties.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    Raises ValueError if *context* is combined with cafile/capath/cadefault,
    or if those legacy parameters are used without SSL support.
    '''
    global _opener
    if cafile or capath or cadefault:
        # ``warnings`` is already imported at module level; the former
        # function-local ``import warnings`` was redundant and is dropped.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a default, certificate-verifying context from the CA material.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Lazily create and cache the default opener on first use.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
def install_opener(opener):
    """Install *opener* as the process-wide default used by urlopen()."""
    global _opener
    _opener = opener
# Paths of NamedTemporaryFiles created below; deleted by urlcleanup().
_url_tempfiles = []

def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.

    Raises ContentTooShortError if fewer bytes arrive than the server's
    Content-Length header announced.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            # No destination given: download into a named temporary file
            # and remember its path so urlcleanup() can delete it later.
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8      # copy in 8 KiB blocks
            size = -1        # -1 means "total size unknown"
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            if reporthook:
                # Initial call so the hook can render 0% progress.
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    # A short read is an error when the server announced a length.
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    global _opener
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except OSError:
            # Best effort: the file may already be gone.
            pass
    _url_tempfiles.clear()
    if _opener:
        _opener = None
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        # URL carried no authority; fall back to the Host header.
        host = request.get_header("Host", "")
    # Strip a trailing :port, if any, before comparison.
    return _cut_port_re.sub("", host, 1).lower()
class Request:
    """Encapsulate the state of a single URL request.

    Holds the target URL, optional request body (``data``), headers,
    plus the bookkeeping used for cookie handling (``origin_req_host``,
    ``unverifiable``) and for proxying (``selector``, ``_tunnel_host``).
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        self.full_url = url          # property setter also parses the URL
        self.headers = {}
        # Headers that must NOT be copied onto a redirected request.
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None     # set by set_proxy() for https-via-proxy
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Re-attach the fragment that the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split full_url into type (scheme), host and selector."""
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        return self.full_url

    def set_proxy(self, host, type):
        """Route this request through the proxy at *host* (scheme *type*)."""
        if self.type == 'https' and not self._tunnel_host:
            # https goes through the proxy by CONNECT tunnelling; keep the
            # original host so the tunnel can be established.
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        # True once set_proxy() made the selector the absolute URL.
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        # Ordinary headers take precedence over unredirected ones.
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())
class OpenerDirector:
    """Manage a chain of BaseHandler objects and use them to open URLs."""

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}       # protocol -> handlers with <proto>_open
        self.handle_error = {}      # protocol -> {code -> error handlers}
        self.process_response = {}  # protocol -> response post-processors
        self.process_request = {}   # protocol -> request pre-processors

    def add_handler(self, handler):
        """Register *handler* under every protocol method it defines.

        Method names of the form <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<code> determine which
        lookup tables the handler joins.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    # Numeric HTTP status codes are stored as ints, e.g. 404.
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each chain sorted by handler_order (BaseHandler.__lt__).
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return a response."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error handler chain."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # Fall back to the catch-all http_error_default chain.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
563# XXX probably also want an abstract factory that knows when it makes
564# sense to skip a superclass in favor of a subclass and when it might
565# make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def _supersedes(check, klass):
        # An argument displaces a default if it is (a subclass of) that
        # default's class, whether given as a class or as an instance.
        if isinstance(check, type):
            return issubclass(check, klass)
        return isinstance(check, klass)

    # Instantiate only those defaults not displaced by an argument,
    # preserving the original registration order.
    for klass in default_classes:
        if not any(_supersedes(check, klass) for check in handlers):
            opener.add_handler(klass())

    # Then add the caller's handlers, instantiating bare classes.
    for handler in handlers:
        if isinstance(handler, type):
            handler = handler()
        opener.add_handler(handler)
    return opener
class BaseHandler:
    """Common base class for handlers managed by an OpenerDirector.

    Handlers are sorted into their chains by ``handler_order``; lower
    values run earlier.
    """

    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Sort by handler_order; order-unaware classes compare greater."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other_order
class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses into the opener's error machinery."""

    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # RFC 2616: only the 2xx range means the request was received,
        # understood and accepted; everything else goes to the error chain.
        is_success = 200 <= code < 300
        if not is_success:
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: surface any unhandled HTTP error status."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No other handler dealt with this status code, so raise it to
        # the caller as an HTTPError exception.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Handle HTTP 301/302/303/307 responses by re-issuing the request."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the redirected request is issued
        # as a bodyless GET, so they would be stale.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path and urlparts.netloc:
            # Authority-only target: normalize to a "/" path.
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
756def _parse_proxy(proxy):
757 """Return (scheme, user, password, host/port) given a URL or an authority.
759 If a URL is supplied, it must have an authority (host:port) component.
760 According to RFC 3986, having an authority component means the URL must
761 have two slashes after the scheme.
762 """
763 scheme, r_scheme = _splittype(proxy)
764 if not r_scheme.startswith("/"):
765 # authority
766 scheme = None
767 authority = proxy
768 else:
769 # URL
770 if not r_scheme.startswith("//"):
771 raise ValueError("proxy URL with no authority: %r" % proxy)
772 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
773 # and 3.3.), path is empty or starts with '/'
774 if '@' in r_scheme:
775 host_separator = r_scheme.find('@')
776 end = r_scheme.find("/", host_separator)
777 else:
778 end = r_scheme.find("/", 2)
779 if end == -1:
780 end = None
781 authority = r_scheme[2:end]
782 userinfo, hostport = _splituser(authority)
783 if userinfo is not None:
784 user, password = _splitpasswd(userinfo)
785 else:
786 user = password = None
787 return scheme, user, password, hostport
class ProxyHandler(BaseHandler):
    """Rewrite requests so they are sent via the configured proxies."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            # Fall back to proxies from the environment/platform settings.
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            type = type.lower()
            # Synthesize a <scheme>_open method for each configured scheme;
            # the keyword defaults bind the current url/type, avoiding the
            # late-binding closure pitfall.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Honor the platform's no-proxy configuration for this host.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            # Credentials embedded in the proxy URL become a
            # Proxy-Authorization: Basic header.
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
class HTTPPasswordMgr:
    """Store and look up (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        # realm -> {tuple of reduced URIs: (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        realm_map = self.passwd.setdefault(realm, {})
        # Store keys both with and without the default port so lookups
        # match either spelling of the authority.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uris)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri* in *realm*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                if any(self.is_suburi(u, reduced) for u in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        parts = urlsplit(uri)
        if parts[1]:
            # Full URI (note HTTP URLs do not have a userinfo component).
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # Bare host or host:port.
            scheme, authority, path = None, uri, '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the scheme's default port explicit in the authority.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to credentials stored under
    the wildcard realm ``None`` when no realm-specific match exists."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first, then retry under the default (None) realm."""
        user, password = super().find_user_password(realm, authuri)
        if user is None:
            return super().find_user_password(None, authuri)
        return user, password
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks which URIs may receive
    preemptive (prior) Basic authentication."""

    def __init__(self, *args, **kwargs):
        # reduced-uri -> bool: may credentials be sent unprompted?
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Register credentials and record the prior-auth flag for *uri*."""
        self.update_authenticated(uri, is_authenticated)
        if realm is not None:
            # Also register under the default realm for prior-auth requests.
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Set the prior-auth flag for one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri

        for default_port in (True, False):
            for u in uris:
                reduced = self.reduce_uri(u, default_port)
                self.authenticated[reduced] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the stored flag for *authuri*, or None if never recorded."""
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uri, flag in self.authenticated.items():
                if self.is_suburi(uri, reduced):
                    return flag
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic authentication (server and proxy)."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'       # start of the string or ','
                    '[ \t]*'        # optional whitespaces
                    '([^ \t,]+)'    # scheme like "Basic"
                    '[ \t]+'        # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            # No recognizable challenge in the header; yield the bare
            # scheme (or '') so the caller can report it as unsupported.
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (scheme,))

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with Basic credentials, or return None if none."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                # The same credentials were already rejected once; give up
                # rather than loop retrying them.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        # Preemptive auth only applies when the password manager tracks
        # is_authenticated state (HTTPPasswordMgrWithPriorAuth).
        if (not hasattr(self.passwd, 'is_authenticated') or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        # Keep the prior-auth table in sync with the request outcome.
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with HTTP Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Delegate to the shared Basic-auth machinery; the full request
        # URL doubles as the authority used for the password lookup.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with proxy Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
# Return n random bytes (used below when generating a digest-auth cnonce).
_randombytes = os.urandom
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication (RFC 2617).

    Mixed into HTTPDigestAuthHandler and ProxyDigestAuthHandler, which
    supply the concrete ``auth_header`` name.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count / last_nonce implement the RFC 2617 nc counter,
        # which restarts at 1 whenever the server issues a new nonce.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical credentials were already refused; give up
                # instead of looping.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for *req* from the
        parsed challenge *chal*, or return None when the challenge is
        unusable or no credentials are known."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per RFC 2617, when server sends "auth,auth-int", the
        # client could use either `auth` or `auth-int` to the response back.
        # we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in (token.strip() for token in qop.split(',')):
            # BUGFIX: strip whitespace around each qop token.  A server
            # sending qop="auth-int, auth" previously produced the token
            # ' auth', so 'auth' was not recognized and a URLError was
            # raised even though auth was offered.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            # qop being truthy implies the 'auth' branch above ran, so
            # ncvalue and cnonce are bound.
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The authority for the credential lookup is the netloc component
        # of the request URL.
        authority = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           authority, req, headers)
        self.reset_retry_count()
        return retry
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For a proxy challenge the authority is simply the request host.
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.host, req, headers)
        self.reset_retry_count()
        return retry
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler and HTTPSHandler.

    do_request_() normalizes a Request before sending; do_open() performs
    the exchange with the connection class supplied by the subclass.
    """

    def __init__(self, debuglevel=0):
        # Forwarded to the underlying HTTPConnection in do_open().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client's helper, which understands bytes,
        # file-like objects and iterables of bytes.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        # Validate the host and fill in the default headers: Content-type
        # and Content-length (or Transfer-encoding: chunked) for POST
        # bodies, the Host header, and the opener-wide addheaders.
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Unknown body length: fall back to chunked encoding.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxying, the selector is an absolute URL: the Host
            # header must name the origin server, not the proxy.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunnelling through a proxy (typically for HTTPS).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
class HTTPHandler(AbstractHTTPHandler):
    """Opener handler for plain-text HTTP URLs."""

    def http_open(self, req):
        # All the heavy lifting happens in AbstractHTTPHandler.do_open().
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Opener handler for HTTPS URLs; only defined when the
        interpreter was built with SSL support."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            # Forward the TLS configuration to the HTTPSConnection.
            return self.do_open(http.client.HTTPSConnection, req,
                                context=self._context,
                                check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest Set-Cookie
    headers from responses, using an http.cookiejar.CookieJar."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Write the jar's matching cookies into the request headers.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Store any cookies the response carries.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
class UnknownHandler(BaseHandler):
    """Fallback handler: any scheme nobody else claims is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip one level of surrounding double quotes.  Slicing instead
        # of indexing keeps an empty value (e.g. "token=") from raising
        # IndexError; for non-empty values the behavior is unchanged.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = ''
    escaped = False
    in_quotes = False

    for ch in s:
        if escaped:
            # Character following a backslash inside a quoted string is
            # taken literally (the backslash itself is dropped).
            current += ch
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current += ch
        elif ch == ',':
            # Top-level comma: finish the current element.
            items.append(current)
            current = ''
        else:
            if ch == '"':
                in_quotes = True
            current += ch

    if current:
        items.append(current)

    return [item.strip() for item in items]
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL whose host (if any) refers to this machine.

        A non-local host that is not one of this machine's addresses
        raises URLError.  NOTE(review): when the host IS in get_names()
        the method falls through and returns None — presumably so another
        handler can take over; confirm before changing.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Lazily compute, and cache on the class, the tuple of IP
        # addresses that refer to this machine.
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file, with synthesized
        Content-type/Content-length/Last-modified headers."""
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = _splitport(host)
            # Accept the file when there is no host, or when the host
            # (without an explicit port) resolves to one of our addresses.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1523def _safe_gethostbyname(host):
1524 try:
1525 return socket.gethostbyname(host)
1526 except socket.gaierror:
1527 return None
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Open an ftp:// URL: resolve the host, log in with credentials
        from the URL's userinfo (if any), and retrieve the file or
        directory listing named by the selector."""
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # 'I' (binary) transfer for a file, 'D' (directory listing)
            # otherwise; a ;type= attribute on the URL overrides this.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot (non-persistent) connection; CacheFTPHandler overrides
        # this to reuse connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps a bounded, per-key cache of connections.

    Connections idle longer than ``delay`` seconds are closed; at most
    ``max_conns`` connections are kept.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the idle timeout (in seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of simultaneously cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # BUGFIX: min() over an empty sequence raises ValueError; when
            # every cached connection just expired there is nothing left
            # to wait for, so fall back to 0.
            self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
class DataHandler(BaseHandler):
    """Handler for data: URLs as specified in RFC 2397."""

    def data_open(self, req):
        # syntax:
        #   dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        #   mediatype := [ type "/" subtype ] *( ";" parameter )
        #   data      := *urlchar
        #   parameter := attribute "=" value
        # POSTed data, if any, is ignored.
        url = req.full_url
        scheme, rest = url.split(":", 1)
        mediatype, rest = rest.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(rest)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-7]

        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        headers = email.message_from_string(
            "Content-type: %s\nContent-length: %d\n" %
            (mediatype, len(payload)))

        return addinfourl(io.BytesIO(payload), headers, url)
# Code moved from the old urllib module
MAXFTPCACHE = 10     # Trim the ftp cache beyond this size
# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # On POSIX a file-URL path only needs percent-decoding.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # Inverse of url2pathname: percent-encode unsafe characters.
        return quote(pathname)
# Module-wide cache of FTP connections; installed as the default
# per-instance ftpcache by URLopener.__init__ (see below).
ftpcache = {}
1693class URLopener:
1694 """Class to open URLs.
1695 This is a class rather than just a subroutine because we may need
1696 more than one set of global protocol-specific options.
1697 Note -- this is a base class for those who don't want the
1698 automatic handling of errors type 302 (relocated) and 401
1699 (authorization needed)."""
1701 __tempfiles = None
1703 version = "Python-urllib/%s" % __version__
1705 # Constructor
    def __init__(self, proxies=None, **x509):
        # This whole class is legacy API: steer callers to urlopen() and
        # friends.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry 'key_file'/'cert_file' for HTTPS client auth.
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.
1732 def __del__(self):
1733 self.close()
1735 def close(self):
1736 self.cleanup()
1738 def cleanup(self):
1739 # This code sometimes runs when the rest of this module
1740 # has already been deleted, so it can't use any globals
1741 # or import anything.
1742 if self.__tempfiles:
1743 for file in self.__tempfiles:
1744 try:
1745 self.__unlink(file)
1746 except OSError:
1747 pass
1748 del self.__tempfiles[:]
1749 if self.tempcache:
1750 self.tempcache.clear()
1752 def addheader(self, *args):
1753 """Add a header to be used by the HTTP interface only
1754 e.g. u.addheader('Accept', 'sound/basic')"""
1755 self.addheaders.append(args)
1757 # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            # Serve a previously retrieved copy from the temp-file cache.
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            # Route through the configured proxy; open_*() receives a
            # (host, fullurl) pair instead of a plain selector.
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        # open_local_file is deliberately excluded from dynamic dispatch;
        # unknown and local schemes go through open_unknown*() instead.
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])
1794 def open_unknown(self, fullurl, data=None):
1795 """Overridable interface to open unknown URL type."""
1796 type, url = _splittype(fullurl)
1797 raise OSError('url error', 'unknown url type', type)
1799 def open_unknown_proxy(self, proxy, fullurl, data=None):
1800 """Overridable interface to open unknown URL type."""
1801 type, url = _splittype(fullurl)
1802 raise OSError('url error', 'invalid proxy for %s' % type, proxy)
1804 # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # A plain local file needs no copy: hand back its own path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a named temporary file
                # whose suffix matches the URL path's extension.
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    # Initial callback before any data arrives.
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
1868 # Each method named open_<type> knows how to open that type of URL
    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """
        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: split userinfo credentials out of the host.
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is a (proxyhost, full-url) pair set up
            # by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    # Host is exempted from proxying: talk to it directly.
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
1962 def open_http(self, url, data=None):
1963 """Use HTTP protocol."""
1964 return self._open_generic_http(http.client.HTTPConnection, url, data)
1966 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
1967 """Handle http errors.
1969 Derived class can override this, or provide specific handlers
1970 named http_error_DDD where DDD is the 3-digit error code."""
1971 # First check if there's a specific handler for this error
1972 name = 'http_error_%d' % errcode
1973 if hasattr(self, name):
1974 method = getattr(self, name)
1975 if data is None:
1976 result = method(url, fp, errcode, errmsg, headers)
1977 else:
1978 result = method(url, fp, errcode, errmsg, headers, data)
1979 if result: return result
1980 return self.http_error_default(url, fp, errcode, errmsg, headers)
1982 def http_error_default(self, url, fp, errcode, errmsg, headers):
1983 """Default error handler: close the connection and raise OSError."""
1984 fp.close()
1985 raise HTTPError(url, errcode, errmsg, headers, None)
    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory handed to _open_generic_http(); carries
            # the opener's client certificate configuration from __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)
def open_file(self, url):
    """Use local file or FTP depending on form of URL."""
    if not isinstance(url, str):
        raise URLError('file error: proxy support for file protocol currently not implemented')
    # Reject file://host/... forms unless the host part is empty or
    # the literal 'localhost'; anything else names a remote machine.
    names_remote_host = (url[:2] == '//'
                         and url[2:3] != '/'
                         and url[2:12].lower() != 'localhost/')
    if names_remote_host:
        raise ValueError("file:// scheme is supported only on localhost")
    return self.open_local_file(url)
def open_local_file(self, url):
    """Use local file.

    Serves the file only when the URL has no host part, or when the
    host resolves to this machine (and carries no port); otherwise
    raises URLError.  Content-Type, Content-Length and Last-modified
    headers are synthesized from the file's stat information.
    """
    # Imported lazily: only needed when a file: URL is actually opened.
    import email.utils
    import mimetypes
    host, file = _splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        raise URLError(e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified))
    if not host:
        # No host part: plain local path; normalize absolute paths to a
        # full file:// URL for the returned object.
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    host, port = _splitport(host)
    # Explicit host: accept it only when no port was given and the name
    # resolves to one of this machine's own addresses.
    if (not port
        and socket.gethostbyname(host) in ((localhost(),) + thishost())):
        urlfile = file
        if file[:1] == '/':
            urlfile = 'file://' + file
        elif file[:2] == './':
            raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
        return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise URLError('local file error: not on local host')
def open_ftp(self, url):
    """Use FTP protocol.

    Open connections are cached in self.ftpcache keyed by
    (user, host, port, directory) so repeated fetches can reuse the
    same login.  Returns an addinfourl wrapping the data stream.
    """
    if not isinstance(url, str):
        raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
    import mimetypes
    # Break the URL into host, credentials, port, path and ;attrs.
    host, path = _splithost(url)
    if not host: raise URLError('ftp error: no host given')
    host, port = _splitport(host)
    user, host = _splituser(host)
    if user: user, passwd = _splitpasswd(user)
    else: passwd = None
    host = unquote(host)
    user = unquote(user or '')
    passwd = unquote(passwd or '')
    host = socket.gethostbyname(host)
    if not port:
        import ftplib
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    path, attrs = _splitattr(path)
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    # A leading empty component means the path was absolute.
    if dirs and not dirs[0]: dirs = dirs[1:]
    if dirs and not dirs[0]: dirs[0] = '/'
    key = user, host, port, '/'.join(dirs)
    # XXX thread unsafe!
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily: drop every entry except
        # the one we are about to use.
        for k in list(self.ftpcache):
            if k != key:
                v = self.ftpcache[k]
                del self.ftpcache[k]
                v.close()
    try:
        if key not in self.ftpcache:
            self.ftpcache[key] = \
                ftpwrapper(user, passwd, host, port, dirs)
        # Default transfer type: directory listing when no file named,
        # binary ('I') otherwise; a ";type=x" URL attribute overrides.
        if not file: type = 'D'
        else: type = 'I'
        for attr in attrs:
            attr, value = _splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                type = value.upper()
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
        headers = ""
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        # retrlen can be None when the server did not report a size.
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors() as exp:
        raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])
def open_data(self, url, data=None):
    """Use "data" URL."""
    if not isinstance(url, str):
        raise URLError('data error: proxy support for data protocol currently not implemented')
    # ignore POSTed data
    #
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        type, data = url.split(',', 1)
    except ValueError:
        raise OSError('data error', 'bad data URL')
    if not type:
        type = 'text/plain;charset=US-ASCII'
    # A trailing ";token" without '=' names a transfer encoding rather
    # than a mediatype parameter.
    semi = type.rfind(';')
    if semi >= 0 and '=' not in type[semi:]:
        encoding = type[semi+1:]
        type = type[:semi]
    else:
        encoding = ''
    if encoding == 'base64':
        # XXX is this encoding/decoding ok?
        payload = base64.decodebytes(data.encode('ascii')).decode('latin-1')
    else:
        payload = unquote(data)
    lines = [
        'Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                   time.gmtime(time.time())),
        'Content-type: %s' % type,
        'Content-Length: %d' % len(payload),
        '',
        payload,
    ]
    msg = '\n'.join(lines)
    headers = email.message_from_string(msg)
    f = io.StringIO(msg)
    #f.fileno = None # needed for addinfourl
    return addinfourl(f, headers, url)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Maps "realm@host" -> (user, password) from earlier prompts.
        self.auth_cache = {}
        # Consecutive-redirect counter, bounded by maxtries.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Unlike the base class, hand the error response back to the
        # caller as an ordinary response object.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Too many consecutive redirects: report a synthetic 500
            # instead of looping forever.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            # Reset the counter once this redirect chain is resolved.
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a redirect from the Location (or legacy URI) header;
        # returns None when neither header is present.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each failed check below calls the BASE class handler, which
        # raises HTTPError — so control never continues past it.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth
        # based on the scheme of the original request.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but keyed on the Proxy-Authenticate
        # header; failed checks raise via the base-class handler.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding user:pass credentials
        # into the configured http proxy URL.
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip any credentials already embedded in the proxy host; i is
        # also passed to get_user_passwd so a stale cache entry is cleared.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # Same as retry_proxy_http_basic_auth, for the https proxy.
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:pass embedded in the URL itself.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # Same as retry_http_basic_auth, for https URLs.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, password) for realm@host, prompting if needed.

        A truthy clear_cache discards any cached entry first so the
        user is re-prompted.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        # Only cache non-empty credentials.
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            # Treat Ctrl-C at the prompt as "no credentials".
            print()
            return None, None
2350# Utility functions
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The result of the first DNS lookup is cached in the module-level
    _localhost variable and reused on later calls.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    Looked up once via gethostbyname_ex and cached as a tuple in the
    module-level _thishost variable.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        _, _, addresses = socket.gethostbyname_ex(socket.gethostname())
    except socket.gaierror:
        # The machine's own hostname does not resolve; fall back to
        # the addresses of 'localhost'.
        _, _, addresses = socket.gethostbyname_ex('localhost')
    _thishost = tuple(addresses)
    return _thishost
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily so FTP support costs nothing unless it
    is actually used; the result is cached for later calls.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    A single shared instance is created on first use and returned on
    every subsequent call.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2389# Utility classes
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of outstanding file objects reading from this connection.
        self.refcount = 0
        # When False, the connection is torn down as soon as refcount
        # drops to zero (see file_close/close).
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Make sure a half-initialized connection is cleaned up
            # before propagating the failure.
            self.close()
            raise

    def init(self):
        """Connect, log in, and change to the target directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start retrieving *file*; return (file-like object, length).

        type 'd'/'D' requests a directory listing (ASCII mode); any
        other value is sent as an FTP TYPE command (e.g. 'I' binary).
        The returned length may be None when the server did not report
        a size.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection has likely gone stale; reconnect
            # once and retry the command.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # A 550 reply means RETR failed (e.g. it is a directory);
                # fall through and attempt a listing instead.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the original working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close runs when the caller closes the returned object,
        # decrementing refcount (and possibly closing the connection).
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        self.busy = 0

    def close(self):
        # Disable keepalive; tear down now only if no readers remain.
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Called via addclosehook when a returned file object is closed.
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            # Best effort: the connection may already be gone.
            pass
2485# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # Pass 1: accept any capitalization of <scheme>_proxy.
    for name, value in os.environ.items():
        lowered = name.lower()
        if value and lowered[-6:] == '_proxy':
            proxies[lowered[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client.
    # If "proxy" is lowercase, it will still be used thanks to the next pass.
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Pass 2: variables whose suffix is exactly (case-sensitively)
    # '_proxy' take precedence; an empty value removes the scheme.
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            scheme = name.lower()[:-6]
            if value:
                proxies[scheme] = value
            else:
                proxies.pop(scheme, None)
    return proxies
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return False
    # '*' is special case for always bypass
    if no_proxy == '*':
        return True
    host = host.lower()
    # strip port off host
    hostonly, port = _splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        name = name.strip()
        if not name:
            continue
        name = name.lstrip('.').lower()  # ignore leading dots
        if name in (hostonly, host):
            return True
        suffix = '.' + name
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return True
    # otherwise, don't bypass
    return False
# This code tests an OSX specific data structure but is testable on all
# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = _splitport(host)

    def ip2num(ipAddr):
        # Pack a dotted quad into one 32-bit integer; short forms like
        # '10.1' are zero-padded on the right to four components.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    # Resolved lazily below, only if a numeric exception is present.
    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric exception, optionally with a /prefix-length mask.
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except OSError:
                    # Unresolvable host cannot match an IP exception.
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit mask: infer one from the number of dotted
                # components, e.g. '10.1' -> /16.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            if mask < 0 or mask > 32:
                # System libraries ignore invalid prefix lengths
                continue

            # Compare only the network (prefix) bits of both addresses.
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False
# Select platform-appropriate implementations of getproxies() and
# proxy_bypass() at import time.
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        # Thin wrapper: fetch the live system proxy settings and defer
        # to the platform-independent _proxy_bypass_macosx_sysconf.
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        # Environment variables win over system configuration.
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('(?:[^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Returns 1/0 (not True/False) to match the historical API.
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            # Translate the registry glob into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment