• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below).  It opens the URL and returns the results as a file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work.  Each Handler implements a particular protocol or
10option.  The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL.  For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns.  The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib.  pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back.  One difference is that you can also pass
20a Request instance instead of URL.  Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers.  Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
27instantiate.  If one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request.  The
38state can be as simple as the URL.  It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
49import urllib.request
50
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54                      uri='https://mahler:8092/site-updates.py',
55                      user='klem',
56                      passwd='geheim$parole')
57
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62                                     urllib.request.CacheFTPHandler)
63
64# install it
65urllib.request.install_opener(opener)
66
67f = urllib.request.urlopen('http://www.python.org/')
68"""
69
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled?  The client needs to know the HTTP error code.  But if
74# the handler knows what the problem was, e.g., that it didn't know
75# the hash algorithm requested in the challenge, it would be good to
76# pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies  XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
100
101
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
106    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
107    unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
109
110# check for SSL
111try:
112    import ssl
113except ImportError:
114    _have_ssl = False
115else:
116    _have_ssl = True
117
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# Module-global default opener used by urlopen(); set by install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a context
    manager and has methods such as

    * geturl() - return the URL of the resource retrieved, commonly used to
      determine if a redirect was followed

    * info() - return the meta-information of the page, such as headers, in the
      form of an email.message_from_string() instance (see Quick Reference to
      HTTP Headers)

    * getcode() - return the HTTP status code of the response.  Raises URLError
      on errors.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        # ``warnings`` is already imported at module level; no need for a
        # redundant function-local import here.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a context from the legacy CA arguments and fall through to
        # the common context-handling path below.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
    if context:
        # A one-off opener carrying the custom SSL context; the module-wide
        # default opener is left untouched.
        opener = build_opener(HTTPSHandler(context=context))
    elif _opener is None:
        # Lazily create and cache the module-wide default opener.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
223
def install_opener(opener):
    """Install *opener* as the module-global default used by urlopen()."""
    global _opener
    _opener = opener
227
# Paths of NamedTemporaryFiles created by urlretrieve(); urlcleanup()
# removes them.
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            # No destination given: download into a named temp file that
            # urlcleanup() can delete later (hence delete=False).
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8
            size = -1          # -1 means "unknown total size"
            read = 0
            blocknum = 0
            # Content-Length, when present, is used for the short-read
            # check after the download loop.
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            # Initial callback: block 0, before any data has been read.
            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                # NOTE: the hook receives the nominal block size bs, not
                # len(block), which may be smaller for the last block.
                if reporthook:
                    reporthook(blocknum, bs, size)

    # A short read is an error; the partial result is attached to the
    # exception so callers can inspect or clean it up.
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
291
def urlcleanup():
    """Remove temp files left by urlretrieve() and drop the default opener."""
    global _opener
    for path in _url_tempfiles:
        # Best effort: a file may already be gone or be undeletable.
        try:
            os.unlink(path)
        except OSError:
            pass
    _url_tempfiles.clear()

    if _opener:
        _opener = None
304
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.full_url)[1]
    if not host:
        # URL had no netloc; fall back to the Host header, if any.
        host = request.get_header("Host", "")

    # strip any trailing :port before lowercasing
    return _cut_port_re.sub("", host, 1).lower()
322
class Request:
    """Encapsulate the state of a single URL request.

    Holds the URL (with the fragment split off), the optional request
    body (*data*), normal and "unredirected" header dictionaries, and
    bookkeeping used by cookie and redirect handling
    (origin_req_host, unverifiable).
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 method=None):
        self.full_url = url          # property setter also splits the fragment
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None
        # *headers* defaults to None rather than a shared mutable {}.
        # Passing a dict behaves exactly as before; None means no headers.
        if headers:
            for key, value in headers.items():
                self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Reattach the fragment that the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split self._full_url into self.type, self.host and self.selector."""
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        return self.full_url

    def set_proxy(self, host, type):
        """Direct this request through the proxy at *host* with scheme *type*."""
        if self.type == 'https' and not self._tunnel_host:
            # https goes through a tunnel; remember the real host instead
            # of rewriting the selector.
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        """Remove *header_name* from both header dictionaries, if present."""
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        """Return all headers as a list; normal headers take precedence."""
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())
431
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Handlers register via add_handler(); open() runs a request through
    per-protocol request processors, the open chain, and per-protocol
    response processors.  error() dispatches to the error chain.
    """
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}        # kind -> [handlers with <proto>_open]
        self.handle_error = {}       # protocol -> {kind: [handlers]}
        self.process_response = {}   # protocol -> [response processors]
        self.process_request = {}    # protocol -> [request processors]

    def add_handler(self, handler):
        """Register *handler*, indexing its methods by name pattern.

        Method names of the form <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<kind> determine which
        tables the handler is entered into.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # split the name into <protocol>_<condition>
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # e.g. http_error_404 -> protocol "http", kind 404
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    # numeric kinds (HTTP status codes) are stored as ints
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # keep each handler list sorted (BaseHandler.__lt__ orders by
            # handler_order)
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (URL string or Request) and return the response."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then <protocol>_open, then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For http/https, *args* is (req, fp, code, msg, hdrs) and the code
        becomes the lookup kind; unhandled HTTP errors fall back to
        http_error_default.  Returns None if nothing handled the error.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
570
571# XXX probably also want an abstract factory that knows when it makes
572# sense to skip a superclass in favor of a subclass and when it might
573# make sense to include both
574
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def _superseded(default):
        # A default class is dropped when the caller supplied a subclass
        # of it, either as a class or as an instance.
        for supplied in handlers:
            if isinstance(supplied, type):
                if issubclass(supplied, default):
                    return True
            elif isinstance(supplied, default):
                return True
        return False

    # Install the surviving defaults first, in their declared order.
    for klass in default_classes:
        if not _superseded(klass):
            opener.add_handler(klass())

    # Then the caller-supplied handlers, instantiating bare classes.
    for supplied in handlers:
        instance = supplied() if isinstance(supplied, type) else supplied
        opener.add_handler(instance)
    return opener
610
class BaseHandler:
    """Base class for handlers managed by an OpenerDirector."""

    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        """Order handlers by handler_order (used by bisect.insort)."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Preserve the old behavior of sorting custom classes that are
            # unaware of handler_order after the default ones.
            return True
        return self.handler_order < other_order
628
629
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return 2xx responses unchanged; route the rest to the error chain."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # RFC 2616: a "2xx" code indicates that the client's request was
        # successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
646
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: raise HTTPError for any unhandled HTTP error."""
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop detection."""
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only redirect GET/HEAD for all four codes, and POST for 301-303.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the new Request carries no data.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # Normalize an empty path to "/" when there is a netloc.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
762
763
764def _parse_proxy(proxy):
765    """Return (scheme, user, password, host/port) given a URL or an authority.
766
767    If a URL is supplied, it must have an authority (host:port) component.
768    According to RFC 3986, having an authority component means the URL must
769    have two slashes after the scheme.
770    """
771    scheme, r_scheme = _splittype(proxy)
772    if not r_scheme.startswith("/"):
773        # authority
774        scheme = None
775        authority = proxy
776    else:
777        # URL
778        if not r_scheme.startswith("//"):
779            raise ValueError("proxy URL with no authority: %r" % proxy)
780        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
781        # and 3.3.), path is empty or starts with '/'
782        end = r_scheme.find("/", 2)
783        if end == -1:
784            end = None
785        authority = r_scheme[2:end]
786    userinfo, hostport = _splituser(authority)
787    if userinfo is not None:
788        user, password = _splitpasswd(userinfo)
789    else:
790        user = password = None
791    return scheme, user, password, hostport
792
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy-url} mapping."""
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Synthesize a <scheme>_open method for each configured scheme so
        # OpenerDirector.add_handler() discovers it; default arguments bind
        # the current url/type so each lambda keeps its own values.
        for type, url in proxies.items():
            type = type.lower()
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to use *proxy*, or return None to leave it alone."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Hosts on the platform's bypass list are fetched directly.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            # Attach Proxy-Authorization from credentials in the proxy URL.
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
835
class HTTPPasswordMgr:
    """Map (realm, URI-prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # realm -> {tuple-of-reduced-uris: (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        by_realm = self.passwd.setdefault(realm, {})
        # Store under both the port-normalized and as-given forms so
        # lookups succeed whether or not the default port is spelled out.
        for with_default_port in (True, False):
            key = tuple(self.reduce_uri(u, with_default_port) for u in uris)
            by_realm[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for *authuri* in *realm*, or (None, None)."""
        candidates = self.passwd.get(realm, {})
        for with_default_port in (True, False):
            target = self.reduce_uri(authuri, with_default_port)
            for key, credentials in candidates.items():
                if any(self.is_suburi(u, target) for u in key):
                    return credentials
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        split = urlsplit(uri)
        if split[1]:
            # Full URI: take scheme, authority and path.
            scheme = split[0]
            authority = split[1]
            path = split[2] or '/'
        else:
            # Bare host or host:port.
            scheme = None
            authority = uri
            path = '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize to an explicit default port for the scheme.
            std_port = {"http": 80,
                        "https": 443,
                        }.get(scheme)
            if std_port is not None:
                authority = "%s:%d" % (host, std_port)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
898
899
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the ``None`` (default) realm."""

    def find_user_password(self, realm, authuri):
        creds = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        if creds[0] is not None:
            return creds
        # Nothing stored for this realm: retry against the default realm.
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
908
909
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks URIs eligible for pre-emptive
    ("prior") authentication."""

    def __init__(self, *args, **kwargs):
        # reduced-uri -> bool: send credentials without waiting for a 401?
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        self.update_authenticated(uri, is_authenticated)
        # Also register under the default (None) realm so prior-auth
        # requests can find the credentials without a realm.
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Record the prior-auth flag for one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        for with_default_port in (True, False):
            for item in uris:
                key = self.reduce_uri(item, with_default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the stored prior-auth flag for *authuri* (None if unknown)."""
        for with_default_port in (True, False):
            target = self.reduce_uri(authuri, with_default_port)
            for candidate in self.authenticated:
                if self.is_suburi(candidate, target):
                    return self.authenticated[candidate]
939
940
class AbstractBasicAuthHandler:
    """Shared machinery for Basic authentication.

    Parses ``*-Authenticate`` challenge headers and retries the failed
    request with credentials from a password manager.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'   # start of the string or ','
                    '[ \t]*'    # optional whitespaces
                    '([^ \t]+)' # scheme like "Basic"
                    '[ \t]+'    # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            # No realm= challenge matched; still report the bare scheme
            # (if any) so callers can detect unsupported schemes.
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Handle a 401/407 by retrying with Basic credentials.

        *authreq* is the challenge header name; returns the retried
        response, or None when no usable challenge is present.  Raises
        ValueError when only non-Basic schemes were offered.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            # Report the scheme that was actually unsupported, not
            # whichever scheme happened to be parsed last.
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (unsupported,))

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* once with an Authorization header for *realm*."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                # These exact credentials already failed; give up.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        # Pre-emptive auth: only when the password manager tracks
        # prior-auth state (HTTPPasswordMgrWithPriorAuth) and this URI
        # is flagged as authenticated.
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        # Keep the prior-auth flag in sync with the server's verdict.
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1049
1050
1051
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge arrives in the WWW-Authenticate header.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
1061
1062
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
1076
1077
# Return n random bytes; os.urandom is documented as suitable for
# security use (used below for the digest-auth client nonce).
_randombytes = os.urandom
1080
1081
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP Digest authentication (RFC 2617)."""
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # Retry and nonce bookkeeping; nonce_count feeds the nc= field.
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Dispatch a 401/407 challenge; return a retried response or None."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                # Basic challenges are silently declined here (a Basic
                # handler may pick them up); anything else is an error.
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with an Authorization header built from *auth*."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # The identical credentials already failed once; give up.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for challenge *chal*.

        Returns None when required challenge fields, an algorithm
        implementation, or stored credentials are missing.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        # A1/A2 as defined in RFC 2617 s. 3.2.2.2 / 3.2.2.3.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per  RFC 2617, when server sends "auth,auth-int", the client could use either `auth`
        #     or `auth-int` to the response back. we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in qop.split(','):
            # nc= must count requests made with the same nonce.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            # ncvalue/cnonce are always bound here: a truthy qop without
            # 'auth' raised URLError above.
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) hash callables for *algorithm* (RFC 2617 s. 3.2.1)."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1226
1227
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The protection space is identified by the request's authority.
        authority = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           authority, req, headers)
        self.reset_retry_count()
        return retry
1244
1245
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 407 responses by retrying with Digest proxy credentials."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # The proxy's protection space is keyed on the request host.
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.host, req, headers)
        self.reset_retry_count()
        return retry
1257
class AbstractHTTPHandler(BaseHandler):
    """Shared plumbing for HTTPHandler and HTTPSHandler.

    do_request_ normalizes a Request's headers and body before sending;
    do_open performs the exchange through an http.client connection class.
    """

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client's body-length logic; None means unknown.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in default headers (Content-type/length, Host, ...) in place."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Body length unknown (e.g. a file object): stream it.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxied, the Host header must name the origin server,
            # which is embedded in the request selector.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunneling through a proxy (https via http proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1374
1375
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs using http.client.HTTPConnection."""

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    # Request preprocessing is shared with HTTPSHandler.
    http_request = AbstractHTTPHandler.do_request_
1382
# HTTPSConnection only exists when Python was built with ssl support, so
# HTTPSHandler is defined (and exported) conditionally.
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs using http.client.HTTPSConnection."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            # Optional ssl.SSLContext and hostname-check override, passed
            # straight through to HTTPSConnection.
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1399
class HTTPCookieProcessor(BaseHandler):
    """Attach stored cookies to requests and harvest cookies from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        # Default to a fresh in-memory jar when none is supplied.
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Add any matching stored cookies to the outgoing request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record any cookies the server set on this response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1417
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme reaching it is unsupported."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1422
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    One pair of surrounding double quotes is stripped from each value.
    Values that are empty or a single character are kept as-is (the
    length guard prevents an IndexError on ``'key='`` entries).
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip one pair of surrounding double quotes, if present.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1432
1433def parse_http_list(s):
1434    """Parse lists as described by RFC 2068 Section 2.
1435
1436    In particular, parse comma-separated lists where the elements of
1437    the list may include quoted-strings.  A quoted-string could
1438    contain a comma.  A non-quoted string could have quotes in the
1439    middle.  Neither commas nor quotes count if they are escaped.
1440    Only double-quotes count, not single-quotes.
1441    """
1442    res = []
1443    part = ''
1444
1445    escape = quote = False
1446    for cur in s:
1447        if escape:
1448            part += cur
1449            escape = False
1450            continue
1451        if quote:
1452            if cur == '\\':
1453                escape = True
1454                continue
1455            elif cur == '"':
1456                quote = False
1457            part += cur
1458            continue
1459
1460        if cur == ',':
1461            res.append(part)
1462            part = ''
1463            continue
1464
1465        if cur == '"':
1466            quote = True
1467
1468        part += cur
1469
1470    # append last part
1471    if part:
1472        res.append(part)
1473
1474    return [part.strip() for part in res]
1475
class FileHandler(BaseHandler):
    """Open file:// URLs for the local host; remote hosts are rejected."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// request, refusing non-local hosts.

        NOTE(review): when the URL names this machine by one of its own
        addresses (req.host in get_names()), the method falls through and
        returns None, declining the request instead of opening the file.
        This long-standing behavior is preserved here.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if req.host not in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost, resolved lazily and cached on the class
    names = None

    def get_names(self):
        """Return the cached tuple of IP addresses that mean 'this host'."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                # Name lookup failed; fall back to localhost alone.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file with guessed headers.

        Raises URLError when the file cannot be stat'ed or opened, or
        when the host portion does not resolve to this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = _splitport(host)
            # Accept an empty host, or a portless host that resolves to
            # one of this machine's addresses.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1526
def _safe_gethostbyname(host):
    """Resolve *host* to an address, returning None on lookup failure."""
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        return None
    return address
1532
class FTPHandler(BaseHandler):
    """Open ftp:// URLs via ftplib, honoring user:password@host syntax."""

    def ftp_open(self, req):
        """Fetch *req* over FTP and return an addinfourl response.

        Raises URLError for a missing host or any ftplib failure.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = _splituser(host)
        if user:
            user, passwd = _splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = _splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: 'I' (binary) when a file is named,
            # 'D' (directory listing) otherwise; a ;type= attribute in
            # the URL overrides it.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot (non-persistent) connection; CacheFTPHandler overrides
        # this to reuse connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1590
class CacheFTPHandler(FTPHandler):
    """FTP handler that caches and reuses ftpwrapper connections."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # cached-connection lifetime, in seconds
        self.max_conns = 16  # cap on simultaneously cached connections

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating it if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size cap."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # default=0 keeps min() from raising ValueError when every
        # connection has been evicted (or the cache was never filled).
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close before evicting so the socket is not leaked
                    # (mirrors the expiry branch above).
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1643
class DataHandler(BaseHandler):
    def data_open(self, req):
        """Open a data: URL as specified in RFC 2397.

        Any payload attached to the request (e.g. POST data) is ignored.

        Grammar:
          dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
          mediatype := [ type "/" subtype ] *( ";" parameter )
          data      := *urlchar
          parameter := attribute "=" value
        """
        full_url = req.full_url

        _scheme, rest = full_url.split(":", 1)
        mediatype, payload = rest.split(",", 1)

        # Percent-unquote first: even base64-encoded payloads may be quoted.
        payload = unquote_to_bytes(payload)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-7]

        # RFC 2397 default when the mediatype is omitted entirely.
        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        headers = email.message_from_string(
            "Content-type: %s\nContent-length: %d\n" % (mediatype, len(payload)))

        return addinfourl(io.BytesIO(payload), headers, full_url)
1673
1674
# Code moved from the old urllib module
1676
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    # Windows drive letters and backslashes need real translation.
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # On POSIX a file-URL path is already a valid filesystem path once
        # percent-escapes are decoded.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # Inverse of url2pathname: percent-quote unsafe characters.
        return quote(pathname)


# Module-wide cache of open FTP connections shared by URLopener instances
# (see URLopener.open_ftp); keyed by (user, host, port, joined dirs).
ftpcache = {}
1695
1696
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default so cleanup()/__del__ stay safe even if __init__
    # never completed.
    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        # This legacy API is deprecated in favor of urlopen()/OpenerDirector.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry client-certificate material used for HTTPS.
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(_to_bytes(fullurl))
        # Quote the URL while leaving characters legal in URLs untouched.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = _splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = _splittype(proxy)
            host, selector = _splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to an open_<scheme> method.  A scheme name that would
        # resolve directly to open_local_file is treated as unknown, so
        # local files are reached only via the 'file' type set above.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name) or name == 'open_local_file':
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # A plain local file needs no copying: return its own path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file whose suffix
                # mirrors the URL path's extension.
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                # reporthook is called once before the first block and then
                # after each block: reporthook(blocknum, blocksize, totalsize).
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: extract optional user:password@ credentials.
            host, selector = _splithost(url)
            if host:
                user_passwd, host = _splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is the (proxyhost, fullurl) pair built
            # by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = _splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = _splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = _splithost(rest)
                if realhost:
                    user_passwd, realhost = _splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        # Credentials are sent as HTTP Basic: base64 of "user:password".
        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        # Fall back to the generic handler when no specific handler exists
        # or the specific handler declined (returned a falsy value).
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Pass through any client-certificate material from __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-ish response headers from the file's metadata.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        # A host is acceptable only if it resolves to this machine itself.
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Transfer type: 'D' = directory listing, 'I' = binary (image);
            # a ";type=x" URL attribute may override.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without '=' is an encoding (e.g. ";base64").
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        # Build a synthetic HTTP-style response message around the payload.
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2140
2141
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}   # (realm@host) -> (user, passwd) from prompts
        self.tries = 0         # redirect counter, reset by http_error_302
        self.maxtries = 10     # redirect-recursion limit

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Unlike URLopener, hand back the error body as a regular response.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Too many consecutive redirects: report a synthetic 500.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a redirect; returns None when no target header is present.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each failed precondition below delegates to the *base class*
        # default handler, which raises HTTPError and aborts the retry.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        # Retry the request with credentials via retry_<scheme>_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but for the proxy's challenge.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding user:passwd in the proxy URL.
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip any credentials already present in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # HTTPS twin of retry_proxy_http_basic_auth.
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:passwd embedded in the URL itself.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # HTTPS twin of retry_http_basic_auth.
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        # Callers pass a nonzero clear_cache when the URL already carried
        # (failed) credentials, forcing a fresh prompt.
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2352
2353
2354# Utility functions
2355
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The first lookup is memoized in the module-level ``_localhost`` so the
    resolver is consulted at most once per process.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2363
_thishost = None
def thishost():
    """Return the IP addresses of the current host (memoized).

    Falls back to the addresses of 'localhost' when the machine's own
    hostname cannot be resolved.
    """
    global _thishost
    if _thishost is None:
        try:
            addresses = socket.gethostbyname_ex(socket.gethostname())[2]
        except socket.gaierror:
            addresses = socket.gethostbyname_ex('localhost')[2]
        _thishost = tuple(addresses)
    return _thishost
2374
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily so purely HTTP users never pay for it;
    the result is memoized in the module-level ``_ftperrors`` cache.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2383
_noheaders = None
def noheaders():
    """Return an empty email Message object.

    A single shared instance is created on first use and memoized in
    the module-level ``_noheaders`` cache.
    """
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2391
2392
2393# Utility classes
2394
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        # dirs: sequence of path components to cwd into after login.
        # persistent: keep the control connection open between transfers
        # so the connection can be reused from the cache.
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.refcount = 0           # number of outstanding retrfile() handles
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Broad catch is deliberate: tear down the half-open
            # connection, then re-raise whatever connect/login raised.
            self.close()
            raise

    def init(self):
        """(Re)establish the control connection: connect, login, cwd."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        """Start retrieving *file*; return (file-like object, length or None).

        type 'd'/'D' forces a directory listing; any other value is sent
        in an FTP TYPE command (e.g. 'I' for binary, 'A' for ASCII).
        Falls back to a LIST when RETR reports the path is not a file.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Control connection went stale; reconnect and retry once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file" -- fall through to the
                # directory-listing path below; anything else is fatal.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the working directory for reuse.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the returned object is closed, so the
        # refcount tracks handles still held by callers.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        # Mark the data channel idle; callers close their own file objects.
        self.busy = 0

    def close(self):
        """Stop keeping the connection alive; close once unreferenced."""
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Hook invoked when a file object handed out by retrfile() closes.
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        """Unconditionally shut down the FTP control connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            # Best effort: the server may already have dropped us.
            pass
2488
2489# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    environment = list(os.environ.items())
    proxies = {}
    # in order to prefer lowercase variables, process environment in
    # two passes: first matches any, second pass matches lowercase only
    for name, value in environment:
        scheme = name.lower()
        if value and scheme.endswith('_proxy'):
            proxies[scheme[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Second pass: variables whose names are already lowercase win, and an
    # empty value explicitly disables the proxy for that scheme.
    for name, value in environment:
        if name.endswith('_proxy'):
            scheme = name.lower()[:-6]
            if value:
                proxies[scheme] = value
            else:
                proxies.pop(scheme, None)
    return proxies
2520
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    if 'no' not in proxies:
        return False
    no_proxy = proxies['no']
    # '*' is special case for always bypass
    if no_proxy == '*':
        return True
    host = host.lower()
    # strip port off host
    hostonly, port = _splitport(host)
    # check whether the host (with or without port) equals an entry or
    # ends with '.' + entry
    for entry in no_proxy.split(','):
        entry = entry.strip()
        if not entry:
            continue
        entry = entry.lstrip('.').lower()  # ignore leading dots
        if entry in (hostonly, host):
            return True
        suffix = '.' + entry
        if hostonly.endswith(suffix) or host.endswith(suffix):
            return True
    # otherwise, don't bypass
    return False
2554
2555
2556# This code tests an OSX specific data structure but is testable on all
2557# platforms
2558def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2559    """
2560    Return True iff this host shouldn't be accessed using a proxy
2561
2562    This function uses the MacOSX framework SystemConfiguration
2563    to fetch the proxy information.
2564
2565    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2566    { 'exclude_simple': bool,
2567      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2568    }
2569    """
2570    from fnmatch import fnmatch
2571
2572    hostonly, port = _splitport(host)
2573
2574    def ip2num(ipAddr):
2575        parts = ipAddr.split('.')
2576        parts = list(map(int, parts))
2577        if len(parts) != 4:
2578            parts = (parts + [0, 0, 0, 0])[:4]
2579        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2580
2581    # Check for simple host names:
2582    if '.' not in host:
2583        if proxy_settings['exclude_simple']:
2584            return True
2585
2586    hostIP = None
2587
2588    for value in proxy_settings.get('exceptions', ()):
2589        # Items in the list are strings like these: *.local, 169.254/16
2590        if not value: continue
2591
2592        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2593        if m is not None:
2594            if hostIP is None:
2595                try:
2596                    hostIP = socket.gethostbyname(hostonly)
2597                    hostIP = ip2num(hostIP)
2598                except OSError:
2599                    continue
2600
2601            base = ip2num(m.group(1))
2602            mask = m.group(2)
2603            if mask is None:
2604                mask = 8 * (m.group(1).count('.') + 1)
2605            else:
2606                mask = int(mask[1:])
2607            mask = 32 - mask
2608
2609            if (hostIP >> mask) == (base >> mask):
2610                return True
2611
2612        elif fnmatch(host, value):
2613            return True
2614
2615    return False
2616
2617
if sys.platform == 'darwin':
    # macOS: consult the SystemConfiguration framework (via the private
    # _scproxy extension module) as well as the environment.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True if the system proxy settings exclude *host*."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        # Environment variables take precedence over system configuration.
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    # Windows: proxy configuration lives in the registry (the per-user
    # Internet Settings key maintained by the system proxy dialog).
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('(?:[^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if the registry's ProxyOverride list excludes *host*."""
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' matches any host without a dot in its name.
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2774