• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below).  It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work.  Each Handler implements a particular protocol or
10option.  The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL.  For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns.  The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib.  pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back.  One difference is that you can also pass
20a Request instance instead of URL.  Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers.  Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
27instantiate.  If one of the argument is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request.  The
38state can be as simple as the URL.  It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
49import urllib.request
50
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54                      uri='https://mahler:8092/site-updates.py',
55                      user='klem',
56                      passwd='geheim$parole')
57
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62                                     urllib.request.CacheFTPHandler)
63
64# install it
65urllib.request.install_opener(opener)
66
67f = urllib.request.urlopen('http://www.python.org/')
68"""
69
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled?  The client needs to know the HTTP error code.  But if
74# the handler knows that the problem was, e.g., that it didn't know
75# that hash algo that requested in the challenge, it would be good to
76# pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies  XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
100
101
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107    unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
109
110# check for SSL
111try:
112    import ssl
113except ImportError:
114    _have_ssl = False
115else:
116    _have_ssl = True
117
118__all__ = [
119    # Classes
120    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
121    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
122    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
123    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
124    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
125    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
126    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
127    'UnknownHandler', 'HTTPErrorProcessor',
128    # Functions
129    'urlopen', 'install_opener', 'build_opener',
130    'pathname2url', 'url2pathname', 'getproxies',
131    # Legacy interface
132    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
133]
134
135# used in User-Agent header sent
136__version__ = '%d.%d' % sys.version_info[:2]
137
138_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a
    context manager and has the properties url, headers, and status.
    See urllib.response.addinfourl for more detail on these properties.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        # 'warnings' is already imported at module level; no local import
        # is needed here.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a context from the deprecated parameters; the common
        # opener-construction path below then treats it like a
        # caller-supplied context.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
    if context:
        # A one-shot opener wired with the requested SSL context; the
        # globally installed opener is deliberately not used or replaced.
        opener = build_opener(HTTPSHandler(context=context))
    elif _opener is None:
        # Lazily create and cache the default opener on first use.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
215
def install_opener(opener):
    """Install *opener* as the process-wide default used by urlopen().

    Passing None makes urlopen() lazily rebuild its default opener on
    the next call.
    """
    global _opener
    _opener = opener
219
_url_tempfiles = []  # paths of temp files created below; removed by urlcleanup()
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    # The scheme is needed to special-case local file:// URLs below.
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            # delete=False: the caller needs the file after it is closed;
            # urlcleanup() removes it later via _url_tempfiles.
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8          # copy block size
            size = -1            # -1 means "unknown total size"
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            # Initial call with block number 0 lets the hook set up
            # (e.g. initialize a progress display) before any data flows.
            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    # A known Content-Length that was not reached means the transfer was
    # truncated; report it, attaching the partial result to the exception.
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
283
def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    global _opener
    # Best-effort removal: a file may already be gone or be unremovable.
    for leftover in _url_tempfiles:
        with contextlib.suppress(OSError):
            os.unlink(leftover)
    _url_tempfiles.clear()
    # Also drop the cached default opener, if one was built or installed.
    if _opener:
        _opener = None
296
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    # Prefer the host from the URL; fall back to the Host header for
    # URLs that carry no network location.
    host = urlparse(request.full_url)[1]
    if not host:
        host = request.get_header("Host", "")

    # Strip a trailing :port, if any, then normalize case.
    return _cut_port_re.sub("", host, 1).lower()
314
class Request:
    """Encapsulates the state of a single URL request.

    Holds the URL, the optional request body (*data*), two header
    dictionaries (normal and "unredirected"), and bookkeeping used by
    the handlers (proxy/tunnel state, cookie origin information).
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # NOTE: the mutable {} default is safe here because *headers*
        # is only iterated, never mutated.
        self.full_url = url
        self.headers = {}
        self.unredirected_hdrs = {}
        # _data must exist before the data property setter runs, since
        # the setter compares the new value against the old one.
        self._data = None
        self.data = data
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Re-attach the fragment that the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        # Split the stored URL into type (scheme), host, and selector
        # (the part sent to the server in the request line).
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        """Return the full URL, including any fragment."""
        return self.full_url

    def set_proxy(self, host, type):
        # For https the proxy is used for tunnelling: remember the real
        # host and keep the original type/selector; otherwise rewrite the
        # selector to the absolute URL, as proxies expect.
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type= type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        # True once set_proxy() rewrote the selector to the full URL.
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True if *header_name* is in either header dict."""
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return the header's value; regular headers take precedence."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        """Remove *header_name* from both header dicts, if present."""
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        """Return all headers as a list of (name, value) tuples.

        When both dicts hold the same (capitalized) name, the regular
        header wins over the unredirected one.
        """
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())
423
class OpenerDirector:
    """Composite that dispatches requests through a chain of handlers.

    Handlers register themselves via add_handler(); open() runs the
    request processors, the protocol's open methods, and the response
    processors in order.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        # Each dict maps a protocol/kind key to an ordered handler list.
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler*'s protocol methods in the dispatch tables.

        Method names are parsed as <protocol>_<condition>, e.g.
        http_open, http_error_404, http_request, http_response.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # e.g. http_error_404 -> kind 404 (int when it parses)
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # keep lists sorted by handler_order (BaseHandler.__lt__)
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return a response."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first crack at every request.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        # Fall back to any registered unknown_open handler.
        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error through the protocol's error handler chain."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # no handler claimed the specific code; try http_error_default
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
562
563# XXX probably also want an abstract factory that knows when it makes
564# sense to skip a superclass in favor of a subclass and when it might
565# make sense to include both
566
def build_opener(*handlers):
    """Create an OpenerDirector populated with the default handlers.

    The opener supports HTTP, FTP, local files, data URLs and, when
    available, HTTPS.  Handlers passed as arguments (instances, or
    classes which get instantiated) are added as well; any default
    handler that is a base class of an argument is dropped in favour
    of that argument.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # A default class is skipped when a caller-supplied handler (class
    # or instance) specializes it.
    skip = {klass for klass in default_classes
            for check in handlers
            if (issubclass(check, klass) if isinstance(check, type)
                else isinstance(check, klass))}

    for klass in default_classes:
        if klass not in skip:
            opener.add_handler(klass())

    for handler in handlers:
        if isinstance(handler, type):
            handler = handler()
        opener.add_handler(handler)
    return opener
602
class BaseHandler:
    """Common base class for opener handlers; orderable by priority."""

    # Default priority; lower values are consulted earlier in the chain.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        # Handlers live in bisect-sorted lists.  Custom classes that are
        # unaware of handler_order compare as greater, preserving the old
        # behavior of inserting them after the default handlers.
        try:
            other_order = other.handler_order
        except AttributeError:
            return True
        return self.handler_order < other_order
620
621
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # RFC 2616: any 2xx status means the request was successfully
        # received, understood, and accepted -- pass it through untouched.
        if 200 <= code < 300:
            return response

        # Everything else goes through the opener's error chain, which
        # may substitute a different response or raise.
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
638
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort error handler: surface any unhandled HTTP error."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No handler claimed the status code; raise it to the caller.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
642
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only GET/HEAD may follow any of the four codes; POST may only
        # follow 301/302/303 (the new Request carries no data, so the
        # redirected request is issued as a GET).
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: they refer to the old request's
        # payload, which is not re-sent.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Build the redirected request and re-open it via the parent."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # 301, 303, and 307 are handled identically to 302.
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
754
755
756def _parse_proxy(proxy):
757    """Return (scheme, user, password, host/port) given a URL or an authority.
758
759    If a URL is supplied, it must have an authority (host:port) component.
760    According to RFC 3986, having an authority component means the URL must
761    have two slashes after the scheme.
762    """
763    scheme, r_scheme = _splittype(proxy)
764    if not r_scheme.startswith("/"):
765        # authority
766        scheme = None
767        authority = proxy
768    else:
769        # URL
770        if not r_scheme.startswith("//"):
771            raise ValueError("proxy URL with no authority: %r" % proxy)
772        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
773        # and 3.3.), path is empty or starts with '/'
774        end = r_scheme.find("/", 2)
775        if end == -1:
776            end = None
777        authority = r_scheme[2:end]
778    userinfo, hostport = _splituser(authority)
779    if userinfo is not None:
780        user, password = _splitpasswd(userinfo)
781    else:
782        user = password = None
783    return scheme, user, password, hostport
784
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy_url} mapping."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            type = type.lower()
            # Create a <scheme>_open method for each configured scheme.
            # The default arguments bind the current url/type at lambda
            # definition time, avoiding the late-binding closure pitfall.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go via *proxy*; return None so open() continues."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Hosts that proxy_bypass() matches are contacted directly.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            # Embed the proxy credentials as a Basic auth header.
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
827
class HTTPPasswordMgr:
    """Map (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        if isinstance(uri, str):
            uri = [uri]
        realm_map = self.passwd.setdefault(realm, {})
        # Store both the default-port-normalized and the raw forms so a
        # lookup matches whether or not the port is spelled out.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uri)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) matching *authuri*, or (None, None)."""
        candidates = self.passwd.get(realm, {})
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri_tuple, creds in candidates.items():
                if any(self.is_suburi(u, target) for u in uri_tuple):
                    return creds
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # full URI
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # bare host or host:port
            scheme, authority, path = None, uri, '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Make the well-known port explicit so "host" and "host:80"
            # compare equal.
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
890
891
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the default realm (None)."""

    def find_user_password(self, realm, authuri):
        # Try the specific realm first; if nothing matches, retry with
        # realm=None, which callers use as a wildcard realm.
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
900
901
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks which URIs may be sent
    credentials preemptively (without waiting for a 401 challenge).
    """

    def __init__(self, *args, **kwargs):
        # reduced URI -> bool ("send credentials without a challenge?")
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        self.update_authenticated(uri, is_authenticated)
        # Also register under the default realm so prior-auth lookups
        # (which pass realm=None) can find these credentials.
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Record the prior-auth flag for *uri* (or each URI in a sequence)."""
        uris = [uri] if isinstance(uri, str) else uri

        for default_port in (True, False):
            for u in uris:
                key = self.reduce_uri(u, default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the stored flag for the closest matching URI, else None."""
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri, flag in self.authenticated.items():
                if self.is_suburi(uri, target):
                    return flag
931
932
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP and proxy Basic authentication.

    Subclasses provide ``auth_header`` (the request header to set) and an
    ``http_error_40x`` entry point that calls ``http_error_auth_reqed``.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'   # start of the string or ','
                    '[ \t]*'    # optional whitespaces
                    '([^ \t]+)' # scheme like "Basic"
                    '[ \t]+'    # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        """Yield (scheme, realm) for every challenge found in *header*.

        realm is None when a challenge carries no realm; if nothing in the
        header matches at all, a single (first-word-or-'', None) pair is
        yielded so the caller still sees the advertised scheme.
        """
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            # Report the scheme we could not handle (bugfix: previously
            # this formatted the loop variable `scheme`, which holds
            # whatever challenge happened to be parsed last, not the
            # unsupported one).
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (unsupported,))

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with Basic credentials for (realm, host).

        Returns None when no password is known, or when the same
        credentials were already sent (to avoid looping on rejection).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Preemptively attach Basic credentials when the password manager
        has recorded a prior successful authentication for this URL.
        """
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record whether this URL authenticated successfully (2xx) or not."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1041
1042
1043
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with HTTP Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Delegate to the shared Basic-auth machinery, keyed on the
        # WWW-Authenticate challenge header.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
1053
1054
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with proxy Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in the authority.  Assume there isn't one, since urllib.request
        # does not (and should not, RFC 3986 s. 3.2.1) support requests for
        # URLs containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
1068
1069
# Return n random bytes (used below for digest-auth cnonce generation).
_randombytes = os.urandom
1072
1073
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP and proxy Digest authentication.

    Digest authentication is specified in RFC 2617.  Subclasses provide
    ``auth_header`` and an ``http_error_40x`` entry point that calls
    ``http_error_auth_reqed``.
    """

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried caps the number of 401/407 round trips; the nonce
        # bookkeeping below feeds the "nc" field of the digest response.
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """React to a 401/407 carrying *auth_header* by retrying with a
        Digest response; raises HTTPError after too many failed attempts.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                # A Basic challenge is silently ignored here (the Basic
                # handlers deal with it); anything else is an error.
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Parse the challenge in *auth* and re-issue *req* with a
        matching Digest Authorization header.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Identical credentials were already sent; give up rather
                # than loop.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the value of the Digest Authorization header for *req*.

        Returns None when the challenge is malformed, the hash algorithm
        is unusable, or no credentials are known for the realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per  RFC 2617, when server sends "auth,auth-int", the client could use either `auth`
        #     or `auth-int` to the response back. we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in qop.split(','):
            # Reuse and increment nc when the server repeats a nonce,
            # per RFC 2617 section 3.2.2.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest callables for *algorithm* (MD5 or SHA)."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1218
1219
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge is scoped to the URL's authority component.
        authority = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
1236
1237
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 407 proxy challenges with Digest authentication."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # Proxy challenges are scoped to the request host.
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1249
class AbstractHTTPHandler(BaseHandler):
    """Shared request-preparation and connection logic for the HTTP and
    HTTPS handlers.
    """

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client so bodies of any supported kind (bytes,
        # file objects, iterables) are measured consistently; may be None.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in default headers (Content-type, Content-length or
        Transfer-encoding, Host, and the opener's addheaders) and return
        the request.  Raises URLError when the request has no host.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Unknown body length (e.g. a file object): stream it
                    # with chunked transfer encoding instead.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # For proxied requests the Host header must name the origin
            # server taken from the selector, not the proxy.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take priority over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization so duplicates cannot slip in.
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunneling (https through a proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1366
1367
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs via http.client.HTTPConnection."""

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_
1374
# HTTPSConnection only exists when Python was built with ssl support.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs via http.client.HTTPSConnection."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            # Optional ssl.SSLContext and hostname-check override passed
            # straight through to HTTPSConnection.
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1391
class HTTPCookieProcessor(BaseHandler):
    """Attach stored cookies to requests and harvest cookies from
    responses, using an http.cookiejar.CookieJar.
    """

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Stamp any matching stored cookies onto the outgoing request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record Set-Cookie headers from the response in the jar.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1409
class UnknownHandler(BaseHandler):
    """Catch-all handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1414
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from each value; a later
    duplicate key silently overwrites an earlier one.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard against an empty value ("key="), which previously raised
        # IndexError when probing v[0].
        if v and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1424
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = []

    in_quote = False
    pending_escape = False
    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside a quoted-string: keep
            # this char literally (the backslash itself is dropped).
            buf.append(ch)
            pending_escape = False
        elif in_quote:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quote = False
                buf.append(ch)
        elif ch == ',':
            # Top-level separator: close out the current element.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quote = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1467
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL that refers to the local host.

        Raises URLError for a host that is neither localhost nor one of
        our own addresses.  NOTE(review): when the host IS one of our own
        addresses (per get_names()), this falls through and implicitly
        returns None (letting other handlers try) instead of opening the
        file — confirm this dispatch behavior is intended.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return (caching at class level) the IP addresses considered local."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                # Name resolution failed; fall back to just localhost.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Stat and open the local file, returning an addinfourl with
        guessed Content-type, Content-length and Last-modified headers.
        Raises URLError on OS errors or when the file is not local.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = _splitport(host)
            # Serve only when the URL is host-less, or the host resolves
            # to a local address and no explicit port was given.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1518
def _safe_gethostbyname(host):
    """Resolve *host*; return None instead of raising on lookup failure."""
    try:
        addr = socket.gethostbyname(host)
    except socket.gaierror:
        addr = None
    return addr
1524
class FTPHandler(BaseHandler):
    """Open ftp:// URLs, honoring user:password in the authority."""

    def ftp_open(self, req):
        """Retrieve a file or directory listing over FTP.

        Returns an addinfourl wrapping the data connection; all ftplib
        errors are re-raised as URLError.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        # Split ";type=..." style attributes off the path.
        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: binary ('I') for files, directory
            # listing ('D') otherwise; a ";type=..." attribute overrides.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot (non-persistent) connection; CacheFTPHandler overrides
        # this to reuse connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1582
class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}        # key -> live ftpwrapper connection
        self.timeout = {}      # key -> absolute expiry time (epoch seconds)
        self.soonest = 0       # earliest expiry among cached connections
        self.delay = 60        # idle lifetime per connection, in seconds
        self.max_conns = 16    # cap on simultaneously cached connections

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this target, creating one on miss."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            # Cache hit: refresh the expiry.
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            # Evict the entry expiring first, without closing it: the
            # caller of connect_ftp may still hold a reference to it.
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1635
class DataHandler(BaseHandler):
    """Handler for 'data:' URLs as specified in RFC 2397."""

    def data_open(self, req):
        """Decode a data: URL and return it wrapped as a response object.

        Any POSTed data is ignored.  Raises ValueError when the mandatory
        comma separator is missing.
        """
        # Grammar (RFC 2397):
        #   dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        #   mediatype := [ type "/" subtype ] *( ";" parameter )
        #   data      := *urlchar
        #   parameter := attribute "=" value
        full_url = req.full_url

        scheme, rest = full_url.split(":", 1)
        mediatype, payload = rest.split(",", 1)

        # Percent-decoding applies even to base64-encoded payloads.
        payload = unquote_to_bytes(payload)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(";base64")]

        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        header_text = "Content-type: %s\nContent-length: %d\n" % (
            mediatype, len(payload))
        headers = email.message_from_string(header_text)

        return addinfourl(io.BytesIO(payload), headers, full_url)
1665
1666
# Code move from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # On POSIX the URL path is already a filesystem path, merely
        # %xx-quoted.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)


# Module-global FTP connection cache shared by URLopener instances
# (each instance binds it as self.ftpcache in __init__).
ftpcache = {}
1687
1688
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default so cleanup() is safe to call even if __init__
    # never ran (e.g. a subclass raised before calling it).
    __tempfiles = None

    # User-Agent value sent with every request.
    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        # This whole class is legacy API; steer callers to urlopen().
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Optional client-certificate material for HTTPS (see
        # _https_connection below).
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        """Release held resources (temp files and the temp cache)."""
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        # Serve from the (optional) retrieve() cache when possible.
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the scheme-specific open_<scheme>() method; refuse
        # to dispatch straight to open_local_file (only open_file may
        # call it).
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # Shortcut for local files: no copy needed, return their own path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: make a temp file whose suffix matches
                # the URL path, and remember it for cleanup().
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                # Copy the body across in 8 KiB blocks, reporting progress.
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Tuple form (host, fullurl) marks a proxied request; see the
            # special case set up in open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        # Credentials travel as Basic auth headers (base64 of "user:pass").
        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            # A falsy result means the specific handler declined; fall
            # through to the default handler.
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory for _open_generic_http(); carries the
            # optional client certificate configured in __init__().
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # file://<host>/... is only accepted for an empty host or
        # 'localhost'.
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers from the file's metadata.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        # A host is only acceptable when it resolves to this machine.
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: directory listing vs. binary ('I')
            # download; a ';type=...' URL attribute overrides it below.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # The last ';'-separated field is the transfer encoding, unless it
        # is a 'key=value' mediatype parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        # Build an RFC 822 style message so email can parse the headers.
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2132
2133
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Cache of (user, passwd) tuples keyed by "realm@host" for Basic
        # auth; see get_user_passwd().
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Guard against redirect loops: after maxtries redirects,
            # report a synthetic 500 instead of recursing forever.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow the Location (or legacy URI) header; without either
        # there is nowhere to redirect to.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE: the *base-class* http_error_default() raises HTTPError, so
        # each call below terminates this method when the challenge cannot
        # be handled (the override in this class merely returns).
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, driven by the
        # Proxy-Authenticate challenge header instead.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Re-register the http proxy with credentials and retry."""
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip credentials already embedded in the proxy host, if any.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Re-register the https proxy with credentials and retry."""
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry the request with credentials embedded in the URL."""
        host, selector = _splithost(url)
        # Drop any credentials already present before re-adding.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry the request with credentials embedded in the URL."""
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm@host, prompting if not cached.

        Callers pass a truthy clear_cache when the URL already carried
        credentials, which drops the stale cached entry first."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2344
2345
2346# Utility functions
2347
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost' (cached)."""
    global _localhost
    if _localhost is not None:
        return _localhost
    # Resolve once; subsequent calls reuse the cached address.
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2355
_thishost = None
def thishost():
    """Return the tuple of IP addresses of the current host (cached)."""
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        # The local hostname may not resolve; fall back to 'localhost'.
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2366
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class (cached)."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    # Import lazily so the module loads without pulling in ftplib.
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2375
# Cached empty Message instance shared by all callers.
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    The same (lazily created) instance is returned on every call, so
    callers must not mutate it.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2383
2384
2385# Utility classes
2386
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        # user/passwd are the FTP credentials; dirs is the sequence of
        # path components to cwd into after login.  persistent controls
        # whether the connection is kept alive after all files retrieved
        # through it have been closed.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # Number of file objects handed out by retrfile() that are still
        # open; the connection is only really closed once they all close.
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Tear down any half-established connection, then re-raise
            # the original exception (bare except so even KeyboardInterrupt
            # still triggers the cleanup).
            self.close()
            raise

    def init(self):
        """Connect, log in and change into the requested directory."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Return (file-like object, length-or-None) for *file*.

        type 'd'/'D' forces a directory listing; any other value is sent
        verbatim in a TYPE command (e.g. 'I' for binary, 'A' for ASCII).
        Raises URLError on FTP errors other than the 550 fallback below.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Control connection may have gone stale; reconnect once and
            # retry the TYPE command.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file" -- fall through and try a
                # directory listing instead.  Anything else is fatal.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the previous working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # Wrap the data socket so that closing the returned object calls
        # file_close() and the refcount bookkeeping below.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        # Mark the data connection as no longer in use.
        self.busy = 0

    def close(self):
        """Disable keepalive and close once no handed-out files remain open."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Hook invoked when a file object from retrfile() is closed.
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Unconditionally close the underlying FTP control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            # Best effort -- the connection may already be gone.
            pass
2480
2481# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # Two passes over the environment so that lowercase variables win:
    # the first pass accepts any capitalization, the second pass (below)
    # re-applies names whose '_proxy' suffix is already lowercase.
    for envvar, url in os.environ.items():
        envvar = envvar.lower()
        if url and envvar.endswith('_proxy'):
            proxies[envvar[:-6]] = url
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for envvar, url in os.environ.items():
        if envvar.endswith('_proxy'):
            envvar = envvar.lower()
            if url:
                proxies[envvar[:-6]] = url
            else:
                # An explicitly empty value removes any earlier mapping.
                proxies.pop(envvar[:-6], None)
    return proxies
2512
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    if 'no' not in proxies:
        # no_proxy was not specified: never bypass.
        return False
    no_proxy = proxies['no']
    if no_proxy == '*':
        # Wildcard: bypass the proxy for every host.
        return True
    host = host.lower()
    # Compare against the host both with and without any :port part.
    hostonly, port = _splitport(host)
    for entry in no_proxy.split(','):
        entry = entry.strip()
        if not entry:
            continue
        entry = entry.lstrip('.').lower()   # ignore leading dots
        if entry in (hostonly, host):
            return True
        dotted = '.' + entry
        if hostonly.endswith(dotted) or host.endswith(dotted):
            return True
    # No suffix matched: do not bypass.
    return False
2546
2547
2548# This code tests an OSX specific data structure but is testable on all
2549# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = _splitport(host)

    def _addr_to_int(addr):
        # Pack a (possibly partial) dotted-quad string into a 32-bit int,
        # zero-padding missing octets on the right.
        octets = list(map(int, addr.split('.')))
        if len(octets) != 4:
            octets = (octets + [0, 0, 0, 0])[:4]
        return (octets[0] << 24) | (octets[1] << 16) | (octets[2] << 8) | octets[3]

    # Simple (dot-less) host names may be excluded wholesale.
    if '.' not in host and proxy_settings['exclude_simple']:
        return True

    host_num = None   # our resolved address, computed lazily below

    for pattern in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not pattern:
            continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", pattern)
        if m is None:
            # Not numeric: treat the entry as a hostname glob.
            if fnmatch(host, pattern):
                return True
            continue

        if host_num is None:
            try:
                host_num = _addr_to_int(socket.gethostbyname(hostonly))
            except OSError:
                # Can't resolve ourselves; numeric entries can't match.
                continue

        base = _addr_to_int(m.group(1))
        mask = m.group(2)
        if mask is None:
            # No explicit prefix: one byte of prefix per dotted component.
            prefix_len = 8 * (m.group(1).count('.') + 1)
        else:
            prefix_len = int(mask[1:])

        if prefix_len < 0 or prefix_len > 32:
            # System libraries ignore invalid prefix lengths
            continue

        shift = 32 - prefix_len
        if (host_num >> shift) == (base >> shift):
            return True

    return False
2613
2614
if sys.platform == 'darwin':
    # macOS: proxy information comes from the SystemConfiguration
    # framework via the private _scproxy extension module.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True iff *host* should bypass the proxy per macOS settings."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return proxies from the environment, else from macOS settings."""
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('(?:[^/:]+)://', address):
                            # No scheme given: default to the protocol's own.
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Return 1 if the registry's ProxyOverride list says *host* should
        # be reached directly, else 0.
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' matches any host without a dot in its name.
                if '.' not in rawHost:
                    return 1
            # Translate the glob-style override entry into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2771