• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below).  It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work.  Each Handler implements a particular protocol or
10option.  The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL.  For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns.  The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib.  pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back.  One difference is that you can also pass
20a Request instance instead of URL.  Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers.  Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request.  The
38state can be as simple as the URL.  It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
49import urllib.request
50
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54                      uri='https://mahler:8092/site-updates.py',
55                      user='klem',
56                      passwd='geheim$parole')
57
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62                                     urllib.request.CacheFTPHandler)
63
64# install it
65urllib.request.install_opener(opener)
66
67f = urllib.request.urlopen('http://www.python.org/')
68"""
69
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
76# pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies  XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
100
101
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107    unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
109
110# check for SSL
111try:
112    import ssl
113except ImportError:
114    _have_ssl = False
115else:
116    _have_ssl = True
117
118__all__ = [
119    # Classes
120    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
121    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
122    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
123    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
124    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
125    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
126    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
127    'UnknownHandler', 'HTTPErrorProcessor',
128    # Functions
129    'urlopen', 'install_opener', 'build_opener',
130    'pathname2url', 'url2pathname', 'getproxies',
131    # Legacy interface
132    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
133]
134
135# used in User-Agent header sent
136__version__ = '%d.%d' % sys.version_info[:2]
137
138_opener = None
139def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
140            *, cafile=None, capath=None, cadefault=False, context=None):
141    '''Open the URL url, which can be either a string or a Request object.
142
143    *data* must be an object specifying additional data to be sent to
144    the server, or None if no such data is needed.  See Request for
145    details.
146
147    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
148    header in its HTTP requests.
149
150    The optional *timeout* parameter specifies a timeout in seconds for
151    blocking operations like the connection attempt (if not specified, the
152    global default timeout setting will be used). This only works for HTTP,
153    HTTPS and FTP connections.
154
155    If *context* is specified, it must be a ssl.SSLContext instance describing
156    the various SSL options. See HTTPSConnection for more details.
157
158    The optional *cafile* and *capath* parameters specify a set of trusted CA
159    certificates for HTTPS requests. cafile should point to a single file
160    containing a bundle of CA certificates, whereas capath should point to a
161    directory of hashed certificate files. More information can be found in
162    ssl.SSLContext.load_verify_locations().
163
164    The *cadefault* parameter is ignored.
165
166    This function always returns an object which can work as a context
167    manager and has methods such as
168
169    * geturl() - return the URL of the resource retrieved, commonly used to
170      determine if a redirect was followed
171
172    * info() - return the meta-information of the page, such as headers, in the
173      form of an email.message_from_string() instance (see Quick Reference to
174      HTTP Headers)
175
176    * getcode() - return the HTTP status code of the response.  Raises URLError
177      on errors.
178
179    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
180    object slightly modified. In addition to the three new methods above, the
181    msg attribute contains the same information as the reason attribute ---
182    the reason phrase returned by the server --- instead of the response
183    headers as it is specified in the documentation for HTTPResponse.
184
185    For FTP, file, and data URLs and requests explicitly handled by legacy
186    URLopener and FancyURLopener classes, this function returns a
187    urllib.response.addinfourl object.
188
189    Note that None may be returned if no handler handles the request (though
190    the default installed global OpenerDirector uses UnknownHandler to ensure
191    this never happens).
192
193    In addition, if proxy settings are detected (for example, when a *_proxy
194    environment variable like http_proxy is set), ProxyHandler is default
195    installed and makes sure the requests are handled through the proxy.
196
197    '''
198    global _opener
199    if cafile or capath or cadefault:
200        import warnings
201        warnings.warn("cafile, capath and cadefault are deprecated, use a "
202                      "custom context instead.", DeprecationWarning, 2)
203        if context is not None:
204            raise ValueError(
205                "You can't pass both context and any of cafile, capath, and "
206                "cadefault"
207            )
208        if not _have_ssl:
209            raise ValueError('SSL support not available')
210        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
211                                             cafile=cafile,
212                                             capath=capath)
213        https_handler = HTTPSHandler(context=context)
214        opener = build_opener(https_handler)
215    elif context:
216        https_handler = HTTPSHandler(context=context)
217        opener = build_opener(https_handler)
218    elif _opener is None:
219        _opener = opener = build_opener()
220    else:
221        opener = _opener
222    return opener.open(url, data, timeout)
223
224def install_opener(opener):
225    global _opener
226    _opener = opener
227
228_url_tempfiles = []
229def urlretrieve(url, filename=None, reporthook=None, data=None):
230    """
231    Retrieve a URL into a temporary location on disk.
232
233    Requires a URL argument. If a filename is passed, it is used as
234    the temporary file location. The reporthook argument should be
235    a callable that accepts a block number, a read size, and the
236    total file size of the URL target. The data argument should be
237    valid URL encoded data.
238
239    If a filename is passed and the URL points to a local resource,
240    the result is a copy from local file to new file.
241
242    Returns a tuple containing the path to the newly created
243    data file as well as the resulting HTTPMessage object.
244    """
245    url_type, path = _splittype(url)
246
247    with contextlib.closing(urlopen(url, data)) as fp:
248        headers = fp.info()
249
250        # Just return the local path and the "headers" for file://
251        # URLs. No sense in performing a copy unless requested.
252        if url_type == "file" and not filename:
253            return os.path.normpath(path), headers
254
255        # Handle temporary file setup.
256        if filename:
257            tfp = open(filename, 'wb')
258        else:
259            tfp = tempfile.NamedTemporaryFile(delete=False)
260            filename = tfp.name
261            _url_tempfiles.append(filename)
262
263        with tfp:
264            result = filename, headers
265            bs = 1024*8
266            size = -1
267            read = 0
268            blocknum = 0
269            if "content-length" in headers:
270                size = int(headers["Content-Length"])
271
272            if reporthook:
273                reporthook(blocknum, bs, size)
274
275            while True:
276                block = fp.read(bs)
277                if not block:
278                    break
279                read += len(block)
280                tfp.write(block)
281                blocknum += 1
282                if reporthook:
283                    reporthook(blocknum, bs, size)
284
285    if size >= 0 and read < size:
286        raise ContentTooShortError(
287            "retrieval incomplete: got only %i out of %i bytes"
288            % (read, size), result)
289
290    return result
291
292def urlcleanup():
293    """Clean up temporary files from urlretrieve calls."""
294    for temp_file in _url_tempfiles:
295        try:
296            os.unlink(temp_file)
297        except OSError:
298            pass
299
300    del _url_tempfiles[:]
301    global _opener
302    if _opener:
303        _opener = None
304
305# copied from cookielib.py
306_cut_port_re = re.compile(r":\d+$", re.ASCII)
307def request_host(request):
308    """Return request-host, as defined by RFC 2965.
309
310    Variation from RFC: returned value is lowercased, for convenient
311    comparison.
312
313    """
314    url = request.full_url
315    host = urlparse(url)[1]
316    if host == "":
317        host = request.get_header("Host", "")
318
319    # remove port, if present
320    host = _cut_port_re.sub("", host, 1)
321    return host.lower()
322
323class Request:
324
325    def __init__(self, url, data=None, headers={},
326                 origin_req_host=None, unverifiable=False,
327                 method=None):
328        self.full_url = url
329        self.headers = {}
330        self.unredirected_hdrs = {}
331        self._data = None
332        self.data = data
333        self._tunnel_host = None
334        for key, value in headers.items():
335            self.add_header(key, value)
336        if origin_req_host is None:
337            origin_req_host = request_host(self)
338        self.origin_req_host = origin_req_host
339        self.unverifiable = unverifiable
340        if method:
341            self.method = method
342
343    @property
344    def full_url(self):
345        if self.fragment:
346            return '{}#{}'.format(self._full_url, self.fragment)
347        return self._full_url
348
349    @full_url.setter
350    def full_url(self, url):
351        # unwrap('<URL:type://host/path>') --> 'type://host/path'
352        self._full_url = unwrap(url)
353        self._full_url, self.fragment = _splittag(self._full_url)
354        self._parse()
355
356    @full_url.deleter
357    def full_url(self):
358        self._full_url = None
359        self.fragment = None
360        self.selector = ''
361
362    @property
363    def data(self):
364        return self._data
365
366    @data.setter
367    def data(self, data):
368        if data != self._data:
369            self._data = data
370            # issue 16464
371            # if we change data we need to remove content-length header
372            # (cause it's most probably calculated for previous value)
373            if self.has_header("Content-length"):
374                self.remove_header("Content-length")
375
376    @data.deleter
377    def data(self):
378        self.data = None
379
380    def _parse(self):
381        self.type, rest = _splittype(self._full_url)
382        if self.type is None:
383            raise ValueError("unknown url type: %r" % self.full_url)
384        self.host, self.selector = _splithost(rest)
385        if self.host:
386            self.host = unquote(self.host)
387
388    def get_method(self):
389        """Return a string indicating the HTTP request method."""
390        default_method = "POST" if self.data is not None else "GET"
391        return getattr(self, 'method', default_method)
392
393    def get_full_url(self):
394        return self.full_url
395
396    def set_proxy(self, host, type):
397        if self.type == 'https' and not self._tunnel_host:
398            self._tunnel_host = self.host
399        else:
400            self.type= type
401            self.selector = self.full_url
402        self.host = host
403
404    def has_proxy(self):
405        return self.selector == self.full_url
406
407    def add_header(self, key, val):
408        # useful for something like authentication
409        self.headers[key.capitalize()] = val
410
411    def add_unredirected_header(self, key, val):
412        # will not be added to a redirected request
413        self.unredirected_hdrs[key.capitalize()] = val
414
415    def has_header(self, header_name):
416        return (header_name in self.headers or
417                header_name in self.unredirected_hdrs)
418
419    def get_header(self, header_name, default=None):
420        return self.headers.get(
421            header_name,
422            self.unredirected_hdrs.get(header_name, default))
423
424    def remove_header(self, header_name):
425        self.headers.pop(header_name, None)
426        self.unredirected_hdrs.pop(header_name, None)
427
428    def header_items(self):
429        hdrs = {**self.unredirected_hdrs, **self.headers}
430        return list(hdrs.items())
431
432class OpenerDirector:
433    def __init__(self):
434        client_version = "Python-urllib/%s" % __version__
435        self.addheaders = [('User-agent', client_version)]
436        # self.handlers is retained only for backward compatibility
437        self.handlers = []
438        # manage the individual handlers
439        self.handle_open = {}
440        self.handle_error = {}
441        self.process_response = {}
442        self.process_request = {}
443
444    def add_handler(self, handler):
445        if not hasattr(handler, "add_parent"):
446            raise TypeError("expected BaseHandler instance, got %r" %
447                            type(handler))
448
449        added = False
450        for meth in dir(handler):
451            if meth in ["redirect_request", "do_open", "proxy_open"]:
452                # oops, coincidental match
453                continue
454
455            i = meth.find("_")
456            protocol = meth[:i]
457            condition = meth[i+1:]
458
459            if condition.startswith("error"):
460                j = condition.find("_") + i + 1
461                kind = meth[j+1:]
462                try:
463                    kind = int(kind)
464                except ValueError:
465                    pass
466                lookup = self.handle_error.get(protocol, {})
467                self.handle_error[protocol] = lookup
468            elif condition == "open":
469                kind = protocol
470                lookup = self.handle_open
471            elif condition == "response":
472                kind = protocol
473                lookup = self.process_response
474            elif condition == "request":
475                kind = protocol
476                lookup = self.process_request
477            else:
478                continue
479
480            handlers = lookup.setdefault(kind, [])
481            if handlers:
482                bisect.insort(handlers, handler)
483            else:
484                handlers.append(handler)
485            added = True
486
487        if added:
488            bisect.insort(self.handlers, handler)
489            handler.add_parent(self)
490
491    def close(self):
492        # Only exists for backwards compatibility.
493        pass
494
495    def _call_chain(self, chain, kind, meth_name, *args):
496        # Handlers raise an exception if no one else should try to handle
497        # the request, or return None if they can't but another handler
498        # could.  Otherwise, they return the response.
499        handlers = chain.get(kind, ())
500        for handler in handlers:
501            func = getattr(handler, meth_name)
502            result = func(*args)
503            if result is not None:
504                return result
505
506    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
507        # accept a URL or a Request object
508        if isinstance(fullurl, str):
509            req = Request(fullurl, data)
510        else:
511            req = fullurl
512            if data is not None:
513                req.data = data
514
515        req.timeout = timeout
516        protocol = req.type
517
518        # pre-process request
519        meth_name = protocol+"_request"
520        for processor in self.process_request.get(protocol, []):
521            meth = getattr(processor, meth_name)
522            req = meth(req)
523
524        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
525        response = self._open(req, data)
526
527        # post-process response
528        meth_name = protocol+"_response"
529        for processor in self.process_response.get(protocol, []):
530            meth = getattr(processor, meth_name)
531            response = meth(req, response)
532
533        return response
534
535    def _open(self, req, data=None):
536        result = self._call_chain(self.handle_open, 'default',
537                                  'default_open', req)
538        if result:
539            return result
540
541        protocol = req.type
542        result = self._call_chain(self.handle_open, protocol, protocol +
543                                  '_open', req)
544        if result:
545            return result
546
547        return self._call_chain(self.handle_open, 'unknown',
548                                'unknown_open', req)
549
550    def error(self, proto, *args):
551        if proto in ('http', 'https'):
552            # XXX http[s] protocols are special-cased
553            dict = self.handle_error['http'] # https is not different than http
554            proto = args[2]  # YUCK!
555            meth_name = 'http_error_%s' % proto
556            http_err = 1
557            orig_args = args
558        else:
559            dict = self.handle_error
560            meth_name = proto + '_error'
561            http_err = 0
562        args = (dict, proto, meth_name) + args
563        result = self._call_chain(*args)
564        if result:
565            return result
566
567        if http_err:
568            args = (dict, 'default', 'http_error_default') + orig_args
569            return self._call_chain(*args)
570
571# XXX probably also want an abstract factory that knows when it makes
572# sense to skip a superclass in favor of a subclass and when it might
573# make sense to include both
574
575def build_opener(*handlers):
576    """Create an opener object from a list of handlers.
577
578    The opener will use several default handlers, including support
579    for HTTP, FTP and when applicable HTTPS.
580
581    If any of the handlers passed as arguments are subclasses of the
582    default handlers, the default handlers will not be used.
583    """
584    opener = OpenerDirector()
585    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
586                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
587                       FTPHandler, FileHandler, HTTPErrorProcessor,
588                       DataHandler]
589    if hasattr(http.client, "HTTPSConnection"):
590        default_classes.append(HTTPSHandler)
591    skip = set()
592    for klass in default_classes:
593        for check in handlers:
594            if isinstance(check, type):
595                if issubclass(check, klass):
596                    skip.add(klass)
597            elif isinstance(check, klass):
598                skip.add(klass)
599    for klass in skip:
600        default_classes.remove(klass)
601
602    for klass in default_classes:
603        opener.add_handler(klass())
604
605    for h in handlers:
606        if isinstance(h, type):
607            h = h()
608        opener.add_handler(h)
609    return opener
610
611class BaseHandler:
612    handler_order = 500
613
614    def add_parent(self, parent):
615        self.parent = parent
616
617    def close(self):
618        # Only exists for backwards compatibility
619        pass
620
621    def __lt__(self, other):
622        if not hasattr(other, "handler_order"):
623            # Try to preserve the old behavior of having custom classes
624            # inserted after default ones (works only for custom user
625            # classes which are not aware of handler_order).
626            return True
627        return self.handler_order < other.handler_order
628
629
630class HTTPErrorProcessor(BaseHandler):
631    """Process HTTP error responses."""
632    handler_order = 1000  # after all other processing
633
634    def http_response(self, request, response):
635        code, msg, hdrs = response.code, response.msg, response.info()
636
637        # According to RFC 2616, "2xx" code indicates that the client's
638        # request was successfully received, understood, and accepted.
639        if not (200 <= code < 300):
640            response = self.parent.error(
641                'http', request, response, code, msg, hdrs)
642
643        return response
644
645    https_response = http_response
646
647class HTTPDefaultErrorHandler(BaseHandler):
648    def http_error_default(self, req, fp, code, msg, hdrs):
649        raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651class HTTPRedirectHandler(BaseHandler):
652    # maximum number of redirections to any single URL
653    # this is needed because of the state that cookies introduce
654    max_repeats = 4
655    # maximum total number of redirections (regardless of URL) before
656    # assuming we're in a loop
657    max_redirections = 10
658
659    def redirect_request(self, req, fp, code, msg, headers, newurl):
660        """Return a Request or None in response to a redirect.
661
662        This is called by the http_error_30x methods when a
663        redirection response is received.  If a redirection should
664        take place, return a new Request to allow http_error_30x to
665        perform the redirect.  Otherwise, raise HTTPError if no-one
666        else should try to handle this url.  Return None if you can't
667        but another Handler might.
668        """
669        m = req.get_method()
670        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
671            or code in (301, 302, 303) and m == "POST")):
672            raise HTTPError(req.full_url, code, msg, headers, fp)
673
674        # Strictly (according to RFC 2616), 301 or 302 in response to
675        # a POST MUST NOT cause a redirection without confirmation
676        # from the user (of urllib.request, in this case).  In practice,
677        # essentially all clients do redirect in this case, so we do
678        # the same.
679
680        # Be conciliant with URIs containing a space.  This is mainly
681        # redundant with the more complete encoding done in http_error_302(),
682        # but it is kept for compatibility with other callers.
683        newurl = newurl.replace(' ', '%20')
684
685        CONTENT_HEADERS = ("content-length", "content-type")
686        newheaders = {k: v for k, v in req.headers.items()
687                      if k.lower() not in CONTENT_HEADERS}
688        return Request(newurl,
689                       headers=newheaders,
690                       origin_req_host=req.origin_req_host,
691                       unverifiable=True)
692
693    # Implementation note: To avoid the server sending us into an
694    # infinite loop, the request object needs to track what URLs we
695    # have already seen.  Do this by adding a handler-specific
696    # attribute to the Request object.
697    def http_error_302(self, req, fp, code, msg, headers):
698        # Some servers (incorrectly) return multiple Location headers
699        # (so probably same goes for URI).  Use first header.
700        if "location" in headers:
701            newurl = headers["location"]
702        elif "uri" in headers:
703            newurl = headers["uri"]
704        else:
705            return
706
707        # fix a possible malformed URL
708        urlparts = urlparse(newurl)
709
710        # For security reasons we don't allow redirection to anything other
711        # than http, https or ftp.
712
713        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
714            raise HTTPError(
715                newurl, code,
716                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
717                headers, fp)
718
719        if not urlparts.path and urlparts.netloc:
720            urlparts = list(urlparts)
721            urlparts[2] = "/"
722        newurl = urlunparse(urlparts)
723
724        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
725        # original bytes and percent-encode non-ASCII bytes, and any special
726        # characters such as the space.
727        newurl = quote(
728            newurl, encoding="iso-8859-1", safe=string.punctuation)
729        newurl = urljoin(req.full_url, newurl)
730
731        # XXX Probably want to forget about the state of the current
732        # request, although that might interact poorly with other
733        # handlers that also use handler-specific request attributes
734        new = self.redirect_request(req, fp, code, msg, headers, newurl)
735        if new is None:
736            return
737
738        # loop detection
739        # .redirect_dict has a key url if url was previously visited.
740        if hasattr(req, 'redirect_dict'):
741            visited = new.redirect_dict = req.redirect_dict
742            if (visited.get(newurl, 0) >= self.max_repeats or
743                len(visited) >= self.max_redirections):
744                raise HTTPError(req.full_url, code,
745                                self.inf_msg + msg, headers, fp)
746        else:
747            visited = new.redirect_dict = req.redirect_dict = {}
748        visited[newurl] = visited.get(newurl, 0) + 1
749
750        # Don't close the fp until we are sure that we won't use it
751        # with HTTPError.
752        fp.read()
753        fp.close()
754
755        return self.parent.open(new, timeout=req.timeout)
756
757    http_error_301 = http_error_303 = http_error_307 = http_error_302
758
759    inf_msg = "The HTTP server returned a redirect error that would " \
760              "lead to an infinite loop.\n" \
761              "The last 30x error message was:\n"
762
763
764def _parse_proxy(proxy):
765    """Return (scheme, user, password, host/port) given a URL or an authority.
766
767    If a URL is supplied, it must have an authority (host:port) component.
768    According to RFC 3986, having an authority component means the URL must
769    have two slashes after the scheme.
770    """
771    scheme, r_scheme = _splittype(proxy)
772    if not r_scheme.startswith("/"):
773        # authority
774        scheme = None
775        authority = proxy
776    else:
777        # URL
778        if not r_scheme.startswith("//"):
779            raise ValueError("proxy URL with no authority: %r" % proxy)
780        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
781        # and 3.3.), path is empty or starts with '/'
782        end = r_scheme.find("/", 2)
783        if end == -1:
784            end = None
785        authority = r_scheme[2:end]
786    userinfo, hostport = _splituser(authority)
787    if userinfo is not None:
788        user, password = _splitpasswd(userinfo)
789    else:
790        user = password = None
791    return scheme, user, password, hostport
792
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a ``{scheme: proxy_url}`` map.

    For every scheme in the mapping an ``<scheme>_open`` method is
    created dynamically, so OpenerDirector dispatches matching requests
    to proxy_open().
    """

    # Proxies must be in front of every other handler.
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            scheme = scheme.lower()
            # Bind the current values as lambda defaults so every
            # generated method keeps its own scheme/url pair.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=proxy_url, type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point *req* at the proxy; restart the open if the proxy
        speaks a different scheme than the original request."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Honor the platform's proxy bypass list for this host.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        req.set_proxy(unquote(hostport), proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # Let the other handlers take care of it.
            return None
        # The other handlers don't grok the proxy's URL type: e.g. with
        # proxies={'http': 'ftp://proxy.example.com'} a request for
        # http://acme.example.com/a becomes ftp://proxy.example.com/a,
        # so start the whole open over with the new type.
        return self.parent.open(req, timeout=req.timeout)
835
class HTTPPasswordMgr:
    """Map (realm, URI-prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # {realm: {tuple_of_reduced_uris: (user, password)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for a single URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        by_realm = self.passwd.setdefault(realm, {})
        # Store the reduced URIs both with and without the default port
        # so lookups succeed either way.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uris)
            by_realm[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for *authuri* under
        *realm*, or (None, None) when nothing matches."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                if any(self.is_suburi(u, reduced) for u in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        parts = urlsplit(uri)
        if parts[1]:
            # Full URI (note: HTTP URLs have no userinfo component).
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # Bare host or host:port.
            scheme, authority, path = None, uri, '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree.

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
898
899
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the ``None`` (default) realm."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; retry under the default realm on a miss."""
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
908
909
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks which URIs may be sent
    credentials pre-emptively ("prior auth")."""

    def __init__(self, *args, **kwargs):
        # {reduced_uri: bool} of URIs known to accept our credentials.
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Register credentials and record the prior-auth flag."""
        self.update_authenticated(uri, is_authenticated)
        # Also register under the default realm for prior-auth requests.
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Record the prior-auth flag for one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        for default_port in (True, False):
            for one_uri in uris:
                reduced = self.reduce_uri(one_uri, default_port)
                self.authenticated[reduced] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the stored prior-auth flag for *authuri* (None if unknown)."""
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for known in self.authenticated:
                if self.is_suburi(known, reduced):
                    return self.authenticated[known]
939
940
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic authentication.

    Parses a WWW-Authenticate/Proxy-Authenticate challenge and, when the
    scheme is Basic, retries the request with credentials looked up in
    the attached password manager.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # Allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild).
    # The pattern uses bounded per-field repetitions ('[^,]*' and
    # '[^ \t,]+' rather than '.*' / '[^ \t]+') so a malicious server
    # cannot trigger catastrophic regex backtracking with a crafted
    # challenge header (CVE-2021-3733).
    rx = re.compile('(?:[^,]*,)*[ \t]*([^ \t,]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Expose the password manager's registration method directly.
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Answer a Basic challenge carried in header name *authreq*.

        host may be an authority (without userinfo) or a URL with an
        authority.  Raises ValueError for non-Basic schemes.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    if quote not in ['"', "'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with an Authorization header for (realm, host).

        Returns None when no credentials are known, or when the very
        same credentials were already sent (to avoid a retry loop).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        # Pre-emptively attach credentials when the password manager
        # tracks prior-auth state and knows this URL accepts them.
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        # Keep the prior-auth bookkeeping in sync with what the server
        # actually accepted.
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1018
1019
1020
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge is answered against the full request URL.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
1030
1031
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses from proxies using Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo
        # component in authority.  Assume there isn't one, since
        # urllib.request does not (and should not, RFC 3986 s. 3.2.1)
        # support requests for URLs containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
1045
1046
# Return n random bytes (used below when generating the digest-auth cnonce).
_randombytes = os.urandom
1049
1050
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP Digest authentication (RFC 2617)."""

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        # nonce_count/last_nonce track the nc value required by RFC 2617
        # when the server supplies a qop directive.
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer a Digest challenge in header *auth_header*; give up
        with HTTPError after too many failed attempts."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with a Digest Authorization header built from
        the challenge *auth*; None when no answer could be built or the
        same header was already sent."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque quoted string value provided by
        # the client and used by both client and server to avoid chosen
        # plaintext attacks, to provide mutual authentication, and to
        # provide some message integrity protection.  This isn't a
        # fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value from challenge
        *chal*, or return None when it cannot be answered."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # Check for a missing qop directive *before* splitting it: a
        # server that sends no qop is using RFC 2069 compatibility mode,
        # and calling qop.split() on None raises AttributeError.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in qop.split(','):
            # Per RFC 2617, when the server offers "auth,auth-int" the
            # client may pick either; we always answer with 'auth'.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest callables for *algorithm* ('MD5' or 'SHA')."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1195
1196
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge is answered against the URL's authority part.
        authority = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           authority, req, headers)
        self.reset_retry_count()
        return retry
1213
1214
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 407 responses from proxies using Digest credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # The proxy challenge is answered against the request host.
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.host, req, headers)
        self.reset_retry_count()
        return retry
1226
class AbstractHTTPHandler(BaseHandler):
    """Shared plumbing for HTTPHandler/HTTPSHandler: fills in default
    request headers and drives an http.client connection."""

    def __init__(self, debuglevel=0):
        # Forwarded to the connection's set_debuglevel() in do_open().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client's own computation so both layers
        # agree on when a body length is knowable.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in default headers (Content-type, Content-length or
        Transfer-encoding, Host, and the opener-wide addheaders) and
        return the mutated request.  Raises URLError without a host."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Unknown body size (e.g. a file object): fall back
                    # to chunked transfer encoding.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxied, the Host header must name the origin server
            # taken from the request selector, not the proxy.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take priority over redirected ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. 'content-type' ->
        # 'Content-Type') so duplicates cannot slip through.
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            # Any failure while sending or receiving must not leak the
            # connection.
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1343
1344
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs through http.client.HTTPConnection."""

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    # Request preparation is entirely handled by the base class.
    http_request = AbstractHTTPHandler.do_request_
1351
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs; only defined when http.client was built
        with SSL support (i.e. provides HTTPSConnection)."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            super().__init__(debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                                context=self._context,
                                check_hostname=self._check_hostname)

        # Request preparation is entirely handled by the base class.
        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1368
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies from a CookieJar to outgoing requests and harvest
    Set-Cookie headers from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Add a Cookie header for cookies matching the request URL.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Store any cookies the server set on this response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1386
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1391
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from values.  A value may be
    empty (e.g. "token="); previously this raised IndexError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard against an empty value before peeking at v[0]/v[-1].
        if v and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1401
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = ''
    in_quote = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside a quoted string:
            # take this char literally (the backslash itself is dropped).
            current += ch
            pending_escape = False
        elif in_quote:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quote = False
                current += ch
        elif ch == ',':
            # Unquoted comma: element boundary.
            items.append(current)
            current = ''
        else:
            if ch == '"':
                in_quote = True
            current += ch

    # Append the trailing element, if any.
    if current:
        items.append(current)

    return [item.strip() for item in items]
1444
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL from the local file system.

        NOTE(review): when the URL has a non-local authority that *is*
        among this machine's names, neither branch below returns, so the
        method falls through and returns None -- confirm whether that is
        intended.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return (caching on the class) the IP addresses that count as
        "this host": those of 'localhost' and of the machine's hostname."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file, with Content-type,
        Content-length and Last-modified headers synthesized from the
        file's stat info.  Raises URLError on OS errors or when the URL
        names a different host."""
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = _splitport(host)
            # Serve the file only when the URL names no host, or names
            # this host without an explicit port.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1495
1496def _safe_gethostbyname(host):
1497    try:
1498        return socket.gethostbyname(host)
1499    except socket.gaierror:
1500        return None
1501
class FTPHandler(BaseHandler):
    """Open ftp:// URLs via ftplib, honoring user:password in the URL
    and the ;type= path attribute."""

    def ftp_open(self, req):
        """Fetch the FTP URL in *req*; return an addinfourl whose headers
        carry Content-type/Content-length when known.

        Raises URLError for a missing host, name-resolution failure, or
        any ftplib error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        # Drop the leading empty component produced by an absolute path.
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Binary ('I') transfer when a file is named, directory
            # listing ('D') otherwise; a ;type= attribute overrides.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot (non-persistent) connection; CacheFTPHandler
        # overrides this to reuse connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1559
class CacheFTPHandler(FTPHandler):
    """FTPHandler keeping a bounded, time-limited cache of ftpwrapper
    connections keyed by (user, host, port, dirs, timeout)."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}        # key -> ftpwrapper
        self.timeout = {}      # key -> absolute expiry time
        self.soonest = 0       # earliest expiry across all entries
        self.delay = 60        # lifetime of a cached connection (seconds)
        self.max_conns = 16    # hard cap on cached connections

    def setTimeout(self, t):
        """Set the per-connection lifetime in seconds."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating and
        caching a new one when necessary."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # min() raises ValueError on an empty sequence, so only update
        # soonest while entries remain (the cache may have just been
        # fully expired or cleared).
        if self.timeout:
            self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close the evicted connection instead of leaking it.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1612
class DataHandler(BaseHandler):
    """Open data: URLs as specified in RFC 2397 (POSTed data is ignored).

    Syntax:
      dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
      mediatype := [ type "/" subtype ] *( ";" parameter )
      data      := *urlchar
      parameter := attribute "=" value
    """

    def data_open(self, req):
        url = req.full_url

        scheme, rest = url.split(":", 1)
        mediatype, payload = rest.split(",", 1)

        # Even base64-encoded data URLs might be quoted, so unquote
        # unconditionally before looking at the encoding marker.
        payload = unquote_to_bytes(payload)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-7]

        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        headers = email.message_from_string(
            "Content-type: %s\nContent-length: %d\n" %
            (mediatype, len(payload)))

        return addinfourl(io.BytesIO(payload), headers, url)
1642
1643
1644# Code move from the old urllib module
1645
1646MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
1647
1648# Helper for non-unix systems
if os.name != 'nt':
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
else:
    # Windows needs real drive-letter / backslash handling.
    from nturl2path import url2pathname, pathname2url
1661
1662
1663ftpcache = {}
1664
1665
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level fallback so cleanup()/__del__ stay safe even when
    # __init__ never completed.
    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        # This whole class is deprecated in favour of urlopen()/OpenerDirector.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry 'key_file'/'cert_file' for TLS client certificates
        # (used by _https_connection below).
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch on the scheme: open_http, open_ftp, etc.  open_local_file
        # is deliberately excluded from generic dispatch; only open_file()
        # may reach it.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # Local file and no explicit target: no copy needed, just return
        # the path itself.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No filename given: download into a temp file whose suffix
                # mirrors the URL path's extension.
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                # Copy the body in bs-sized chunks, reporting progress.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: "user:pass@host" credentials may be embedded.
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxy request: open() passed a (proxyhost, fullurl) pair.
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            # Non-2xx: delegate to http_error() / http_error_<code> hooks.
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Client cert/key come from the **x509 kwargs of __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # Reject file://host/... for any host other than localhost.
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-like headers from the file's stat() data.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        # Accept a host part only if it resolves to this machine.
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        # Connections are cached per (user, host, port, path) endpoint.
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # 'D' = directory listing, 'I' = binary file transfer; a
            # ";type=x" URL attribute may override this default.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without '=' is a content-transfer encoding
        # (e.g. ";base64"), not a mediatype parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2109
2110
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # auth_cache maps "realm@host" -> (user, passwd); tries/maxtries
        # bound redirect recursion in http_error_302().
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Give up after maxtries consecutive redirects and report it
            # as a synthetic 500.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            # Reset the counter once the redirect chain resolves (or fails).
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            # No target to follow; let the caller fall through.
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        # Note: the redirected request is always a GET (data is dropped).
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        # 307 must not change the request method, so a POST cannot be
        # silently replayed as GET; treat it as an error instead.
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each failed precondition below delegates to the base-class
        # http_error_default, which raises HTTPError -- execution does
        # not continue past these calls when they trigger.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        # self.type was recorded by open(); dispatch to e.g.
        # retry_http_basic_auth / retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but for the proxy's challenge header.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip any stale credentials already embedded in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        # Re-install the proxy with credentials, then retry the request.
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        # Drop old credentials; a truthy offset also clears the auth cache
        # entry so the user is prompted again.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                # Cached credentials were rejected; forget them.
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2321
2322
2323# Utility functions
2324
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2332
_thishost = None
def thishost():
    """Return the IP addresses of the current host."""
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addrs = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        # Hostname does not resolve; fall back to 'localhost'.
        addrs = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addrs)
    return _thishost
2343
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    # Import lazily so merely loading this module does not pull in ftplib.
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2352
_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2360
2361
2362# Utility classes
2363
2364class ftpwrapper:
2365    """Class used by open_ftp() for cache of open FTP connections."""
2366
    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Record connection parameters and immediately connect via init().

        dirs is a sequence of path components to cwd into after login.
        If the connection attempt fails, it is cleaned up with close()
        before the exception propagates.
        """
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # refcount/keepalive govern when close() really tears the
        # connection down (managed by code outside this view).
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Connection is half-open: release resources, then re-raise.
            self.close()
            raise
2382
2383    def init(self):
2384        import ftplib
2385        self.busy = 0
2386        self.ftp = ftplib.FTP()
2387        self.ftp.connect(self.host, self.port, self.timeout)
2388        self.ftp.login(self.user, self.passwd)
2389        _target = '/'.join(self.dirs)
2390        self.ftp.cwd(_target)
2391
2392    def retrfile(self, file, type):
2393        import ftplib
2394        self.endtransfer()
2395        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2396        else: cmd = 'TYPE ' + type; isdir = 0
2397        try:
2398            self.ftp.voidcmd(cmd)
2399        except ftplib.all_errors:
2400            self.init()
2401            self.ftp.voidcmd(cmd)
2402        conn = None
2403        if file and not isdir:
2404            # Try to retrieve as a file
2405            try:
2406                cmd = 'RETR ' + file
2407                conn, retrlen = self.ftp.ntransfercmd(cmd)
2408            except ftplib.error_perm as reason:
2409                if str(reason)[:3] != '550':
2410                    raise URLError('ftp error: %r' % reason).with_traceback(
2411                        sys.exc_info()[2])
2412        if not conn:
2413            # Set transfer mode to ASCII!
2414            self.ftp.voidcmd('TYPE A')
2415            # Try a directory listing. Verify that directory exists.
2416            if file:
2417                pwd = self.ftp.pwd()
2418                try:
2419                    try:
2420                        self.ftp.cwd(file)
2421                    except ftplib.error_perm as reason:
2422                        raise URLError('ftp error: %r' % reason) from reason
2423                finally:
2424                    self.ftp.cwd(pwd)
2425                cmd = 'LIST ' + file
2426            else:
2427                cmd = 'LIST'
2428            conn, retrlen = self.ftp.ntransfercmd(cmd)
2429        self.busy = 1
2430
2431        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
2432        self.refcount += 1
2433        conn.close()
2434        # Pass back both a suitably decorated object and a retrieval length
2435        return (ftpobj, retrlen)
2436
2437    def endtransfer(self):
2438        self.busy = 0
2439
2440    def close(self):
2441        self.keepalive = False
2442        if self.refcount <= 0:
2443            self.real_close()
2444
2445    def file_close(self):
2446        self.endtransfer()
2447        self.refcount -= 1
2448        if self.refcount <= 0 and not self.keepalive:
2449            self.real_close()
2450
2451    def real_close(self):
2452        self.endtransfer()
2453        try:
2454            self.ftp.close()
2455        except ftperrors():
2456            pass
2457
2458# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # Process the environment in two passes so lowercase variables are
    # preferred: the first pass accepts any capitalization, the second
    # (exact lowercase '_proxy' suffix only) overrides or removes
    # entries from the first.
    for name, value in os.environ.items():
        lowered = name.lower()
        if value and lowered.endswith('_proxy'):
            proxies[lowered[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name.endswith('_proxy'):
            scheme = name.lower()[:-6]
            if value:
                proxies[scheme] = value
            else:
                # An empty lowercase variable explicitly disables the scheme.
                proxies.pop(scheme, None)
    return proxies
2489
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    if 'no' not in proxies:
        return 0
    no_proxy = proxies['no']
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = _splitport(host)
    # bypass when the host (with or without port) ends with one of
    # the listed DNS suffixes
    for entry in no_proxy.split(','):
        entry = entry.strip()
        if not entry:
            continue
        suffix = re.escape(entry.lstrip('.'))  # ignore leading dots
        pattern = r'(.+\.)?%s$' % suffix
        if re.match(pattern, hostonly, re.I) or re.match(pattern, host, re.I):
            return 1
    # otherwise, don't bypass
    return 0
2521
2522
2523# This code tests an OSX specific data structure but is testable on all
2524# platforms
2525def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2526    """
2527    Return True iff this host shouldn't be accessed using a proxy
2528
2529    This function uses the MacOSX framework SystemConfiguration
2530    to fetch the proxy information.
2531
2532    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2533    { 'exclude_simple': bool,
2534      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2535    }
2536    """
2537    from fnmatch import fnmatch
2538
2539    hostonly, port = _splitport(host)
2540
2541    def ip2num(ipAddr):
2542        parts = ipAddr.split('.')
2543        parts = list(map(int, parts))
2544        if len(parts) != 4:
2545            parts = (parts + [0, 0, 0, 0])[:4]
2546        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2547
2548    # Check for simple host names:
2549    if '.' not in host:
2550        if proxy_settings['exclude_simple']:
2551            return True
2552
2553    hostIP = None
2554
2555    for value in proxy_settings.get('exceptions', ()):
2556        # Items in the list are strings like these: *.local, 169.254/16
2557        if not value: continue
2558
2559        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2560        if m is not None:
2561            if hostIP is None:
2562                try:
2563                    hostIP = socket.gethostbyname(hostonly)
2564                    hostIP = ip2num(hostIP)
2565                except OSError:
2566                    continue
2567
2568            base = ip2num(m.group(1))
2569            mask = m.group(2)
2570            if mask is None:
2571                mask = 8 * (m.group(1).count('.') + 1)
2572            else:
2573                mask = int(mask[1:])
2574            mask = 32 - mask
2575
2576            if (hostIP >> mask) == (base >> mask):
2577                return True
2578
2579        elif fnmatch(host, value):
2580            return True
2581
2582    return False
2583
2584
if sys.platform == 'darwin':
    # macOS: consult the SystemConfiguration framework via _scproxy;
    # environment variables take precedence when set.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        # Thin wrapper feeding live system settings into the testable
        # _proxy_bypass_macosx_sysconf() helper defined above.
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        # Environment variables win over the system configuration.
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    # Windows: proxy configuration lives in the registry under the
    # Internet Settings key; environment variables still take precedence.
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            # No scheme given; assume the protocol's own.
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Decide from the registry's ProxyOverride list whether *host*
        # should bypass the proxy.  Returns 1 to bypass, else 0.
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means: bypass for plain (dot-free) host names.
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2741