"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of a URL.  Raises a URLError (subclass of
OSError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""
69
# XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows that the problem
# was, e.g., that it didn't know the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
# FTP errors aren't handled cleanly.
# Check digest against a correct (i.e. non-Apache) implementation.

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import collections
98import tempfile
99import contextlib
100import warnings
101
102
103from urllib.error import URLError, HTTPError, ContentTooShortError
104from urllib.parse import (
105    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
106    splittype, splithost, splitport, splituser, splitpasswd,
107    splitattr, splitquery, splitvalue, splittag, to_bytes,
108    unquote_to_bytes, urlunparse)
109from urllib.response import addinfourl, addclosehook
110
# Probe for SSL support: HTTPS-specific behavior is enabled only when the
# ssl module is importable (_have_ssl gates context creation in urlopen()).
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

# Public API of this module.
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# Cache for the default OpenerDirector used by urlopen(); created lazily
# by urlopen() or replaced explicitly via install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a context
    manager and has methods such as

    * geturl() - return the URL of the resource retrieved, commonly used to
      determine if a redirect was followed

    * info() - return the meta-information of the page, such as headers, in the
      form of an email.message_from_string() instance (see Quick Reference to
      HTTP Headers)

    * getcode() - return the HTTP status code of the response.  Raises URLError
      on errors.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        # BUGFIX: message previously said "cpath"; the parameter is 'capath'.
        # 'warnings' is imported at module level, so the old redundant
        # function-local import was dropped.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a one-off context that trusts only the caller-supplied CAs.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # First use with no custom SSL settings: build and cache the
        # default opener for subsequent calls.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
224
def install_opener(opener):
    """Install *opener* as the process-wide default used by urlopen().

    Passing None makes urlopen() lazily rebuild a fresh default opener
    on its next call.
    """
    global _opener
    _opener = opener
228
# Paths of temporary files created by urlretrieve(); removed by urlcleanup().
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.

    Raises ContentTooShortError if fewer bytes than advertised by
    Content-Length were actually retrieved.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            # delete=False: the file must outlive this call; urlcleanup()
            # removes it later via _url_tempfiles.
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8
            # size stays -1 when the server sends no Content-Length,
            # which disables the short-read check below.
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            # Initial call with block 0 lets the hook set up (e.g. a
            # progress bar) before any data arrives.
            if reporthook:
                reporthook(blocknum, bs, size)

            # Copy in fixed-size blocks until EOF.
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    # Both files are closed here; verify we got everything promised.
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
292
def urlcleanup():
    """Remove temporary files created by urlretrieve() and reset the
    cached global opener so urlopen() rebuilds a default one."""
    global _opener
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except OSError:
            # Best effort -- the file may already have been removed.
            pass
    _url_tempfiles.clear()
    if _opener:
        _opener = None
305
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    # netloc component of the request URL; empty for relative URLs.
    host = urlparse(request.full_url)[1]
    if not host:
        # Fall back to the explicit Host header, if any.
        host = request.get_header("Host", "")
    # Strip a trailing :port, then normalize case.
    return _cut_port_re.sub("", host, 1).lower()
323
class Request:
    """Encapsulate the state of a single URL request.

    The state can be as simple as the URL; it may also include a request
    body (*data*), extra HTTP headers, the origin request host (used for
    cookie handling), and an explicit HTTP *method*.
    """

    def __init__(self, url, data=None, headers=None,
                 origin_req_host=None, unverifiable=False,
                 method=None):
        """Create a request for *url*.

        headers, if given, is a mapping of header name to value.
        (BUGFIX: the default was previously a shared mutable ``{}``
        literal; ``None`` avoids the mutable-default pitfall while
        behaving identically.)
        """
        self.full_url = url
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None
        if headers:
            for key, value in headers.items():
                self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Re-attach the fragment that the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split the URL into type (scheme), host and selector."""
        self.type, rest = splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        return self.full_url

    def set_proxy(self, host, type):
        """Redirect this request through the proxy at *host*."""
        if self.type == 'https' and not self._tunnel_host:
            # HTTPS is tunnelled with CONNECT; remember the original host
            # so the tunnel can be established to it.
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        """Remove *header_name* from both header stores, if present."""
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        """Return all headers as a list of (name, value) tuples.

        Regular headers take precedence over unredirected ones of the
        same name.
        """
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
433
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Four registries map a protocol name (or error kind) to a list of
    handler instances sorted by handler_order:

      handle_open      -- '<proto>_open' methods producing responses
      handle_error     -- '<proto>_error_<code>' methods handling errors
      process_request  -- '<proto>_request' pre-processors
      process_response -- '<proto>_response' post-processors
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler* under every '<proto>_<condition>' method it defines.

        Raises TypeError if *handler* does not look like a BaseHandler
        instance (duck-typed via the add_parent attribute).
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split e.g. 'http_error_404' into protocol='http' and
            # condition='error_404'.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # kind is the trailing part after 'error_': an int status
                # code ('404' -> 404) or a name such as 'default'.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each list sorted by handler_order; BaseHandler.__lt__
            # exists exactly for this bisect.insort call.
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or a Request) and return a response."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try default_open, then '<proto>_open', then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered '<proto>_error_*' handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # No handler for this specific status code; fall back to the
            # catch-all http_error_default handlers.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
571
572# XXX probably also want an abstract factory that knows when it makes
573# sense to skip a superclass in favor of a subclass and when it might
574# make sense to include both
575
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    def _supersedes(check, klass):
        # A passed-in handler replaces a default when it is a subclass
        # of that default (given as a class) or an instance of one.
        if isinstance(check, type):
            return issubclass(check, klass)
        return isinstance(check, klass)

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    # Keep only the defaults not superseded by any caller-supplied handler.
    for klass in default_classes:
        if not any(_supersedes(check, klass) for check in handlers):
            opener.add_handler(klass())

    # Then register the caller's handlers, instantiating bare classes.
    for handler in handlers:
        if isinstance(handler, type):
            handler = handler()
        opener.add_handler(handler)
    return opener
611
class BaseHandler:
    """Base class for handlers registered with an OpenerDirector."""

    # Position in the handler chain; lower values run earlier.  500 is
    # the neutral default shared by most handlers.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector that owns this handler."""
        self.parent = parent

    def close(self):
        """Do nothing; kept only for backwards compatibility."""
        pass

    def __lt__(self, other):
        """Order handlers by handler_order (used by bisect.insort)."""
        try:
            other_order = other.handler_order
        except AttributeError:
            # Custom classes unaware of handler_order sort after the
            # default ones, preserving the historical behavior.
            return True
        return self.handler_order < other_order
629
630
class HTTPErrorProcessor(BaseHandler):
    """Turn non-2xx HTTP responses into errors via the parent opener."""

    # Runs after every other response processor.
    handler_order = 1000

    def http_response(self, request, response):
        """Return 2xx responses unchanged; dispatch the rest as errors."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
647
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: raise HTTPError for any unhandled HTTP error."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # No more specific handler claimed this status code; surface it.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only GET/HEAD may follow any of the four codes; POST may follow
        # 301/302/303 (and is reissued as a GET below).
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the redirected request carries no
        # body, so Content-Length/Content-Type would be stale.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle 302 (also bound to 301/303/307) by opening the new URL."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # An authority with no path (e.g. 'http://host') gets '/' so the
        # rebuilt URL is well-formed.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
763
764
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme.
    """
    scheme, tail = splittype(proxy)
    if not tail.startswith("/"):
        # No path at all: *proxy* is a bare authority such as
        # "host:3128" and whatever splittype peeled off was not
        # actually a scheme.
        scheme = None
        authority = proxy
    else:
        # URL form; RFC 3986 requires '//' before the authority.
        if not tail.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # The authority runs from after '//' up to the next '/' (or
        # to the end of the string when there is no path).
        path_start = tail.find("/", 2)
        if path_start == -1:
            path_start = None
        authority = tail[2:path_start]
    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
793
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy-url} mapping."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Synthesize one '<scheme>_open' method per configured scheme.
        # The lambda binds url/type/meth through default arguments on
        # purpose: a late-binding closure would otherwise see only the
        # values from the final loop iteration.
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go via *proxy*; return None to defer to others."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Hosts matched by the platform's bypass rules are fetched directly.
        if req.host and proxy_bypass(req.host):
            return None

        # Credentials embedded in the proxy URL become a Basic auth header.
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
835
class HTTPPasswordMgr:
    """Store username/password pairs keyed by realm and URI prefix."""

    def __init__(self):
        # Maps realm -> {tuple_of_reduced_uris: (user, password)}.
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for *realm* at one URI or a sequence of URIs."""
        if isinstance(uri, str):
            uri = [uri]
        domains = self.passwd.setdefault(realm, {})
        # Store the reduced URIs both with and without a default port so
        # lookups succeed whichever form the request URL takes.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uri)
            domains[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) matching *authuri* in *realm*, else (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                if any(self.is_suburi(uri, reduced) for uri in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # A full URI: scheme://authority/path
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # Bare "host" or "host:port"
            scheme, authority, path = None, uri, '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize to an explicit default port for known schemes.
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced (authority, path) form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
898
899
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a wildcard (None) realm."""

    def find_user_password(self, realm, authuri):
        # Try the specific realm first; fall back to credentials stored
        # under the default realm (None) when no match is found.
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
908
909
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks which URIs already authenticated.

    Handlers can consult is_authenticated() to pre-emptively send
    credentials instead of waiting for a 401 challenge.
    """

    def __init__(self, *args, **kwargs):
        # Maps reduced (authority, path) -> bool "previously authenticated".
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        self.update_authenticated(uri, is_authenticated)
        # Also store under the default (None) realm for prior-auth lookups.
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Record the authenticated state for one URI or a sequence of URIs."""
        uris = [uri] if isinstance(uri, str) else uri
        for default_port in (True, False):
            for u in uris:
                reduced = self.reduce_uri(u, default_port)
                self.authenticated[reduced] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the recorded state for *authuri*, or None if unknown."""
        for default_port in (True, False):
            reduced = self.reduce_uri(authuri, default_port)
            for uri, state in self.authenticated.items():
                if self.is_suburi(uri, reduced):
                    return state
939
940
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic authentication.

    Subclasses provide ``auth_header`` and the 40x error hook that calls
    http_error_auth_reqed() (see HTTPBasicAuthHandler/ProxyBasicAuthHandler).
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge header named *authreq* and retry with credentials.

        Raises ValueError for a non-Basic challenge; returns None when no
        usable challenge is present.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    if quote not in ['"',"'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with an auth header for (realm, host), if known."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # If identical credentials were already sent, give up rather
            # than loop on the same rejected header.
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Pre-emptively add credentials for URIs known to be authenticated."""
        # Only password managers with prior-auth tracking participate
        # (HTTPPasswordMgrWithPriorAuth).
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record whether the request ended up authenticated (2xx status)."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1018
1019
1020
1021class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
1022
1023    auth_header = 'Authorization'
1024
1025    def http_error_401(self, req, fp, code, msg, headers):
1026        url = req.full_url
1027        response = self.http_error_auth_reqed('www-authenticate',
1028                                          url, req, headers)
1029        return response
1030
1031
1032class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
1033
1034    auth_header = 'Proxy-authorization'
1035
1036    def http_error_407(self, req, fp, code, msg, headers):
1037        # http_error_auth_reqed requires that there is no userinfo component in
1038        # authority.  Assume there isn't one, since urllib.request does not (and
1039        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
1040        # userinfo.
1041        authority = req.host
1042        response = self.http_error_auth_reqed('proxy-authenticate',
1043                                          authority, req, headers)
1044        return response
1045
1046
# Return n random bytes (used below for digest-auth client nonces).
_randombytes = os.urandom
1049
1050
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP Digest access authentication."""
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried guards against endless 401 loops; the nonce bookkeeping
        # feeds the nc (nonce count) field of the digest response.
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer a Digest challenge found in the *auth_header* header, if any.

        Raises HTTPError after too many failed retries, ValueError for a
        challenge scheme that is neither digest nor basic.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with an Authorization answer to challenge *auth*."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            # Avoid resending identical credentials in an endless loop.
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for challenge *chal*.

        Returns None when the challenge is unusable (missing keys, unknown
        hash algorithm, or no stored credentials for the realm).
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        # A1/A2 and the response digest are assembled per RFC 2617 s. 3.2.2.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # nc must increase for repeated use of the same server nonce.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) hash callables for *algorithm* per RFC 2617."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1193
1194
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # Index 1 of the parse result is the authority (host[:port])
        # component, which is what the password manager keys on.
        authority = urlparse(req.full_url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
1211
1212
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against an intermediate proxy (HTTP 407)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For a proxy challenge the request host is the authority to
        # authenticate against.
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.host, req, headers)
        self.reset_retry_count()
        return response
1224
class AbstractHTTPHandler(BaseHandler):
    """Request preparation and connection driving shared by HTTP and HTTPS."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        # Forwarded to the http.client connection in do_open().
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client's body-size logic; returns None when the
        # size cannot be determined (e.g. a generic iterable body).
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in Host, Content-type/-length (or Transfer-encoding) and
        the opener's default headers on *request*, then return it.

        Raises URLError when the request has no host and TypeError for a
        str POST body (bytes are required).
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Unknown body size: stream it chunked instead.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxied, the Host header must name the origin server
            # (taken from the request selector), not the proxy itself.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            # CONNECT tunnelling (e.g. https through a proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1341
1342
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs via http.client.HTTPConnection."""

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    # Request preprocessing (Host, Content-length, ...) is shared with HTTPS.
    http_request = AbstractHTTPHandler.do_request_
1349
# Only define HTTPSHandler when the interpreter was built with SSL support
# (http.client exposes HTTPSConnection in that case).
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs, optionally with a caller-supplied SSL context."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1366
class HTTPCookieProcessor(BaseHandler):
    """Maintain a cookie jar across requests: send matching cookies with
    each request and store cookies set by each response."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        self.cookiejar = (http.cookiejar.CookieJar()
                          if cookiejar is None else cookiejar)

    def http_request(self, request):
        # Attach stored cookies that match this request as Cookie headers.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Harvest Set-Cookie headers from the response into the jar.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1384
class UnknownHandler(BaseHandler):
    """Last-resort handler: fail for URL schemes nobody else claimed."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1389
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    def _strip_quotes(v):
        # Remove one pair of surrounding double quotes, if present.
        return v[1:-1] if v[0] == '"' and v[-1] == '"' else v
    pairs = (elt.split('=', 1) for elt in l)
    return {k: _strip_quotes(v) for k, v in pairs}
1399
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = []
    in_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # The escaped character is kept verbatim, backslash dropped.
            current.append(ch)
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                current.append(ch)
        elif ch == ',':
            # Unquoted comma: element boundary.
            items.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)

    # append last element
    if current:
        items.append(''.join(current))

    return [item.strip() for item in items]
1442
class FileHandler(BaseHandler):
    """Open file:// URLs that refer to the local host."""
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
            # NOTE(review): when req.host IS one of this machine's names,
            # control falls through and returns None, letting the
            # OpenerDirector look for another handler -- confirm intended.
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Lazily resolve and cache (on the class) every address that
        # counts as "this machine".
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl wrapping the local file with guessed headers.

        Raises URLError for OS-level failures and for non-local hosts.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')
1494
1495def _safe_gethostbyname(host):
1496    try:
1497        return socket.gethostbyname(host)
1498    except socket.gaierror:
1499        return None
1500
class FTPHandler(BaseHandler):
    """Open ftp:// URLs via ftplib, one fresh connection per request."""
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        # Split ";type=..."-style attributes off the path, then split the
        # path into CWD-able directory parts plus the final filename.
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Binary (I) transfer for files, directory listing (D) when the
            # URL ends in '/', unless a ";type=" attribute overrides it.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            # Re-raise any ftplib failure as URLError, keeping the traceback.
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Hook for subclasses; CacheFTPHandler overrides this to reuse
        # connections.
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1558
class CacheFTPHandler(FTPHandler):
    """FTP handler that caches connections and expires them after a delay."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> ftpwrapper connection
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # seconds an idle connection stays cached
        self.max_conns = 16  # cap on simultaneously cached connections

    def setTimeout(self, t):
        """Set how long (in seconds) an idle connection is kept."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached connection for this endpoint, creating one if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        # Whether created or reused, the entry's lifetime is refreshed.
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # BUG FIX: min() on an empty sequence raises ValueError; use
        # default=0 for the case where every cached connection expired.
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size; >= (rather than ==) also enforces the cap
        # after setMaxConns() lowered it below the current cache size
        if len(self.cache) >= self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # BUG FIX: close the evicted connection instead of
                    # leaking the underlying FTP socket.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1611
class DataHandler(BaseHandler):
    """Decode data: URLs (RFC 2397) into in-memory response objects."""

    def data_open(self, req):
        # data URLs as specified in RFC 2397.
        #
        # ignores POSTed data
        #
        # syntax:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        url = req.full_url

        scheme, rest = url.split(":", 1)
        mediatype, payload = rest.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        payload = unquote_to_bytes(payload)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(";base64")]

        # RFC 2397 default media type when none is given.
        mediatype = mediatype or "text/plain;charset=US-ASCII"

        headers = email.message_from_string(
            "Content-type: %s\nContent-length: %d\n" % (mediatype,
                                                        len(payload)))
        return addinfourl(io.BytesIO(payload), headers, url)
1641
1642
# Code move from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    # Windows paths need drive-letter and backslash handling, which
    # nturl2path provides.
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # On POSIX this is simply percent-decoding.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        # On POSIX this is simply percent-encoding.
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Process-wide FTP connection cache shared by URLopener instances by
# default (see URLopener.__init__).  Not threadsafe.
ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level fallback so cleanup() cannot fail with AttributeError
    # when __init__ never completed.
    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry 'key_file'/'cert_file' for HTTPS client auth
        # (used by _https_connection below).
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        # Best-effort removal of temp files when the opener is collected.
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            # Schemeless URLs are treated as local files.
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        # Scheme names may contain '-', which is not legal in a method name.
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            # Re-wrap other OS-level failures as a generic 'socket error',
            # preserving the original traceback.
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # Local files need no copy: return the filesystem path directly.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except OSError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file whose suffix
                # mirrors the URL path's extension.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Tuple form (host, fullurl), set up by open(), signals a proxy.
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    # Target host is exempt from proxying: contact it directly.
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            # A falsy result from the specific handler falls through to
            # the default handler.
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Client certificate data comes from the **x509 kwargs given
            # to __init__.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # Reject 'file://host/...' for any host other than localhost.
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host was given: only serve the file if it resolves to this
        # machine (no port allowed).
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # No file part means a directory listing ('D'); otherwise
            # default to binary image transfer ('I').
            if not file: type = 'D'
            else: type = 'I'
            # A ';type=X' URL attribute overrides the transfer type.
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        # A trailing ';encoding' clause (no '=') names a content-transfer
        # encoding rather than a mediatype parameter.
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2113
2114
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Maps 'realm@host' -> (user, passwd) remembered from prompts.
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Guard against redirect loops: after maxtries redirects,
            # report a synthetic 500 instead of recursing further.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            # No redirect target supplied; give up silently.
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE: URLopener.http_error_default (the base-class version,
        # called explicitly below) raises HTTPError, so each of these
        # calls terminates the method when its retry precondition fails.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but negotiates with the proxy instead
        # of the origin server.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Embed freshly-obtained credentials in the proxy URL and retry.
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # Same as retry_proxy_http_basic_auth, for the 'https' proxy entry.
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Strip any old credentials from the host part, prompt, and retry
        # with 'user:passwd@host' embedded in the URL.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # HTTPS twin of retry_http_basic_auth.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        # A truthy clear_cache forces a fresh prompt (used when the URL
        # already carried credentials that were rejected).
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2325
2326
2327# Utility functions
2328
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is performed once and memoized in the module-level
    ``_localhost`` cache.
    """
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2336
_thishost = None
def thishost():
    """Return the IP addresses of the current host.

    Resolved once and memoized; falls back to 'localhost' when the
    machine's own hostname cannot be resolved.
    """
    global _thishost
    if _thishost is not None:
        return _thishost
    try:
        addresses = socket.gethostbyname_ex(socket.gethostname())[2]
    except socket.gaierror:
        addresses = socket.gethostbyname_ex('localhost')[2]
    _thishost = tuple(addresses)
    return _thishost
2347
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported lazily and the result memoized, so the module
    cost is only paid when FTP is actually used.
    """
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2356
_noheaders = None
def noheaders():
    """Return an empty email Message object (built once, then memoized)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2364
2365
2366# Utility classes
2367
2368class ftpwrapper:
2369    """Class used by open_ftp() for cache of open FTP connections."""
2370
    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        """Open an FTP connection and log in; close it if setup fails."""
        # Parameters are kept on the instance so init() can reconnect
        # later (retrfile() re-runs init() after a dropped connection).
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # refcount/keepalive let a shared (cached) wrapper defer the real
        # close until no transfer is using it -- presumably coordinated
        # by endtransfer()/close(); confirm against the rest of the class.
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Make sure a half-open connection is not leaked when
            # connect/login fails; the original exception propagates.
            self.close()
            raise
2386
2387    def init(self):
2388        import ftplib
2389        self.busy = 0
2390        self.ftp = ftplib.FTP()
2391        self.ftp.connect(self.host, self.port, self.timeout)
2392        self.ftp.login(self.user, self.passwd)
2393        _target = '/'.join(self.dirs)
2394        self.ftp.cwd(_target)
2395
2396    def retrfile(self, file, type):
2397        import ftplib
2398        self.endtransfer()
2399        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2400        else: cmd = 'TYPE ' + type; isdir = 0
2401        try:
2402            self.ftp.voidcmd(cmd)
2403        except ftplib.all_errors:
2404            self.init()
2405            self.ftp.voidcmd(cmd)
2406        conn = None
2407        if file and not isdir:
2408            # Try to retrieve as a file
2409            try:
2410                cmd = 'RETR ' + file
2411                conn, retrlen = self.ftp.ntransfercmd(cmd)
2412            except ftplib.error_perm as reason:
2413                if str(reason)[:3] != '550':
2414                    raise URLError('ftp error: %r' % reason).with_traceback(
2415                        sys.exc_info()[2])
2416        if not conn:
2417            # Set transfer mode to ASCII!
2418            self.ftp.voidcmd('TYPE A')
2419            # Try a directory listing. Verify that directory exists.
2420            if file:
2421                pwd = self.ftp.pwd()
2422                try:
2423                    try:
2424                        self.ftp.cwd(file)
2425                    except ftplib.error_perm as reason:
2426                        raise URLError('ftp error: %r' % reason) from reason
2427                finally:
2428                    self.ftp.cwd(pwd)
2429                cmd = 'LIST ' + file
2430            else:
2431                cmd = 'LIST'
2432            conn, retrlen = self.ftp.ntransfercmd(cmd)
2433        self.busy = 1
2434
2435        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
2436        self.refcount += 1
2437        conn.close()
2438        # Pass back both a suitably decorated object and a retrieval length
2439        return (ftpobj, retrlen)
2440
2441    def endtransfer(self):
2442        self.busy = 0
2443
2444    def close(self):
2445        self.keepalive = False
2446        if self.refcount <= 0:
2447            self.real_close()
2448
2449    def file_close(self):
2450        self.endtransfer()
2451        self.refcount -= 1
2452        if self.refcount <= 0 and not self.keepalive:
2453            self.real_close()
2454
2455    def real_close(self):
2456        self.endtransfer()
2457        try:
2458            self.ftp.close()
2459        except ftperrors():
2460            pass
2461
2462# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    suffix = '_proxy'
    proxies = {}
    # Two passes over the environment so that lowercase variables win:
    # the first pass accepts any capitalization, the second matches only
    # names with the exact lowercase suffix and overwrites pass-one hits.
    for name, value in os.environ.items():
        lowered = name.lower()
        if value and lowered.endswith(suffix):
            proxies[lowered[:-len(suffix)]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name.endswith(suffix):
            scheme = name.lower()[:-len(suffix)]
            if value:
                proxies[scheme] = value
            else:
                # An explicitly empty lowercase variable disables that scheme.
                proxies.pop(scheme, None)
    return proxies
2493
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    if 'no' not in proxies:
        # no_proxy unspecified: never bypass
        return 0
    no_proxy = proxies['no']
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # bypass when host (with or without port) ends in one of the suffixes
    for entry in no_proxy.split(','):
        name = entry.strip()
        if not name:
            continue
        pattern = r'(.+\.)?%s$' % re.escape(name.lstrip('.'))
        if (re.match(pattern, hostonly, re.I)
                or re.match(pattern, host, re.I)):
            return 1
    return 0
2525
2526
2527# This code tests an OSX specific data structure but is testable on all
2528# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack a (possibly partial) dotted string into a 32-bit int,
        # zero-padding missing octets and truncating extras.
        octets = ([int(p) for p in ipAddr.split('.')] + [0, 0, 0, 0])[:4]
        num = 0
        for octet in octets:
            num = (num << 8) | octet
        return num

    # Dot-less ("simple") host names may be excluded wholesale.
    if '.' not in host and proxy_settings['exclude_simple']:
        return True

    hostIP = None  # resolved lazily, only if an IP-style exception appears

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value:
            continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is None:
            # Not an IP/network spec: treat the entry as a hostname glob.
            if fnmatch(host, value):
                return True
            continue

        if hostIP is None:
            try:
                hostIP = ip2num(socket.gethostbyname(hostonly))
            except OSError:
                # Can't resolve the host; this entry can't match.
                continue

        base = ip2num(m.group(1))
        mask = m.group(2)
        if mask is None:
            # No explicit prefix length: one byte per dotted component.
            mask = 8 * (m.group(1).count('.') + 1)
        else:
            mask = int(mask[1:])

        # Compare only the network portion of both addresses.
        shift = 32 - mask
        if (hostIP >> shift) == (base >> shift):
            return True

    return False
2587
2588
if sys.platform == 'darwin':
    # On macOS the system proxy configuration lives in the
    # SystemConfiguration framework; _scproxy is the C helper exposing it.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True if host should bypass the proxy per macOS settings."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return proxies from the environment or, failing that, macOS."""
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if the registry's ProxyOverride list matches host."""
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            # Registry key/values missing: never bypass.
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means bypass for any dot-less (intranet) host.
                if '.' not in rawHost:
                    return 1
            # Translate the registry's glob syntax into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2745