• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work.  Each Handler implements a particular protocol or
10option.  The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL.  For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns.  The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib.  pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back.  One difference is that you can also pass
20a Request instance instead of URL.  Raises a URLError (subclass of
21OSError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers.  Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request.  The
38state can be as simple as the URL.  It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43internals:
44BaseHandler and parent
45_call_chain conventions
46
47Example usage:
48
49import urllib.request
50
51# set up authentication info
52authinfo = urllib.request.HTTPBasicAuthHandler()
53authinfo.add_password(realm='PDQ Application',
54                      uri='https://mahler:8092/site-updates.py',
55                      user='klem',
56                      passwd='geheim$parole')
57
58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
59
60# build a new opener that adds authentication and caching FTP handlers
61opener = urllib.request.build_opener(proxy_support, authinfo,
62                                     urllib.request.CacheFTPHandler)
63
64# install it
65urllib.request.install_opener(opener)
66
67f = urllib.request.urlopen('http://www.python.org/')
68"""
69
70# XXX issues:
71# If an authentication error handler that tries to perform
72# authentication for some reason but fails, how should the error be
73# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm that was requested in the challenge, it would be
# good to pass that information along to the client, too.
77# ftp errors aren't handled cleanly
78# check digest against correct (i.e. non-apache) implementation
79
80# Possible extensions:
81# complex proxies  XXX not sure what exactly was meant by this
82# abstract factory for opener
83
84import base64
85import bisect
86import email
87import hashlib
88import http.client
89import io
90import os
91import posixpath
92import re
93import socket
94import string
95import sys
96import time
97import tempfile
98import contextlib
99import warnings
100
101
102from urllib.error import URLError, HTTPError, ContentTooShortError
103from urllib.parse import (
104    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
105    splittype, splithost, splitport, splituser, splitpasswd,
106    splitattr, splitquery, splitvalue, splittag, to_bytes,
107    unquote_to_bytes, urlunparse)
108from urllib.response import addinfourl, addclosehook
109
# Check for SSL support.  _have_ssl gates the cafile/capath/cadefault
# handling in urlopen() below.
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True
117
# Names exported by "from urllib.request import *".
__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]
134
# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# Module-default OpenerDirector: created lazily by urlopen() and
# replaced by install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used). This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options. See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests. cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files. More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a context
    manager and has methods such as

    * geturl() - return the URL of the resource retrieved, commonly used to
      determine if a redirect was followed

    * info() - return the meta-information of the page, such as headers, in the
      form of an email.message_from_string() instance (see Quick Reference to
      HTTP Headers)

    * getcode() - return the HTTP status code of the response.  Raises URLError
      on errors.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified. In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        # 'warnings' is already imported at module level; the previous
        # redundant local import has been removed.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Build and cache the default opener on first use.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)
223
def install_opener(opener):
    """Install *opener* as the module-wide default OpenerDirector.

    Subsequent calls to urlopen() will use it instead of building one.
    """
    global _opener
    _opener = opener
227
# Temporary files created by urlretrieve(); removed by urlcleanup().
_url_tempfiles = []

def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument. If a filename is passed, it is used as
    the temporary file location. The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target. The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # file:// URLs with no explicit destination: just hand back the
        # local path and the "headers"; no sense performing a copy.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Choose the destination file: caller-supplied path, or an
        # auto-deleted-later temporary file.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            blocksize = 1024 * 8
            # Total size is only known when the server sent Content-Length.
            size = int(headers["Content-Length"]) if "content-length" in headers else -1
            bytes_read = 0
            blocknum = 0

            if reporthook:
                reporthook(blocknum, blocksize, size)

            while True:
                block = fp.read(blocksize)
                if not block:
                    break
                bytes_read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, blocksize, size)

    # A short read against a known Content-Length is an error.
    if size >= 0 and bytes_read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (bytes_read, size), result)

    return result
291
def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    global _opener
    for temp_file in _url_tempfiles:
        # Best effort: the file may already have been removed.
        with contextlib.suppress(OSError):
            os.unlink(temp_file)
    _url_tempfiles.clear()
    # Drop the cached default opener as well.
    if _opener:
        _opener = None
304
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse(request.full_url)[1]
    if not host:
        # The URL carried no authority; fall back to the Host header.
        host = request.get_header("Host", "")
    # Strip an explicit :port suffix, if present, and lowercase.
    return _cut_port_re.sub("", host, 1).lower()
322
class Request:
    """Encapsulate the state of a single URL request.

    Holds the URL, optional request body (*data*), normal and
    unredirected headers, and redirect/cookie bookkeeping
    (origin_req_host, unverifiable).
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # NOTE: the mutable default for *headers* is safe here -- it is
        # only read (via .items()), never mutated.
        self.full_url = url          # property setter: unwraps and splits fragment
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data             # property setter (see below)
        self._tunnel_host = None     # set by set_proxy() for https-over-proxy
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Reattach the fragment that the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        # Split the URL into type (scheme), host and selector (path part).
        self.type, rest = splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        """Return the full URL, including any fragment."""
        return self.full_url

    def set_proxy(self, host, type):
        """Route this request through the proxy at *host*."""
        # For https, the proxy is used via CONNECT tunnelling: remember
        # the real host and keep the original selector.
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type= type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        # set_proxy() makes the selector the full URL; use that as the marker.
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        # Keys are normalized with str.capitalize() (e.g. 'Content-length').
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        # Checks both the normal and the unredirected header maps.
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        # Remove from both maps; missing keys are ignored.
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        # Normal headers take precedence over unredirected ones.
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return list(hdrs.items())
432
class OpenerDirector:
    """Composite that dispatches requests to a chain of handler objects.

    Handlers are registered via add_handler(); their methods are
    discovered by name pattern (<protocol>_open, <protocol>_request,
    <protocol>_response, <protocol>_error_<code>) and indexed into
    dispatch tables consulted by open() and error().
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}        # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}       # scheme -> {code -> [handlers]}
        self.process_response = {}   # scheme -> [response processors]
        self.process_request = {}    # scheme -> [request processors]

    def add_handler(self, handler):
        """Register *handler*'s protocol methods with this director."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split "http_error_404" style names into protocol/condition.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # e.g. "http_error_404": kind becomes the int 404;
                # non-numeric suffixes are kept as strings.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Each chain is kept sorted by handler_order (BaseHandler.__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or Request object)."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first crack at every request.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        # Last resort: unknown_open handlers.
        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto> error handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        # For HTTP, fall back to the http_error_default chain.
        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
570
571# XXX probably also want an abstract factory that knows when it makes
572# sense to skip a superclass in favor of a subclass and when it might
573# make sense to include both
574
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)

    def _overrides(default, candidate):
        # A handler class, or an instance of one, that derives from a
        # default handler class replaces that default.
        if isinstance(candidate, type):
            return issubclass(candidate, default)
        return isinstance(candidate, default)

    # Install each default unless a caller-supplied handler supersedes it.
    for klass in default_classes:
        if not any(_overrides(klass, check) for check in handlers):
            opener.add_handler(klass())

    # Then install the caller's handlers, instantiating classes as needed.
    for handler in handlers:
        if isinstance(handler, type):
            handler = handler()
        opener.add_handler(handler)
    return opener
610
class BaseHandler:
    """Common base class for OpenerDirector handlers.

    Subclasses provide methods matching the patterns discovered by
    OpenerDirector.add_handler() (<protocol>_open, <protocol>_request,
    <protocol>_response, <protocol>_error_<code>).
    """

    # Position within a handler chain; lower values sort earlier.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector this handler was added to."""
        self.parent = parent

    def close(self):
        """Do nothing; retained for backwards compatibility only."""

    def __lt__(self, other):
        # Custom user classes that predate handler_order sort after the
        # default handlers by always comparing "less than".
        if not hasattr(other, "handler_order"):
            return True
        return self.handler_order < other.handler_order
628
629
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Pass non-2xx responses through the opener's error machinery."""
        code, msg, hdrs = response.code, response.msg, response.info()

        # RFC 2616: a "2xx" code means the request was successfully
        # received, understood, and accepted -- anything else is an error.
        if 200 <= code < 300:
            return response
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
646
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback: turn any otherwise-unhandled HTTP error into HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # Called by OpenerDirector.error() when no http_error_<code>
        # handler produced a result.
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        # Only GET/HEAD may follow any 30x; POST may follow 301/302/303.
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Body-describing headers do not apply to the (body-less) redirect.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a 30x response by re-opening the request at the new URL."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # An authority with an empty path gets a canonical "/" path.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # All supported 30x codes share the same implementation.
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
762
763
764def _parse_proxy(proxy):
765    """Return (scheme, user, password, host/port) given a URL or an authority.
766
767    If a URL is supplied, it must have an authority (host:port) component.
768    According to RFC 3986, having an authority component means the URL must
769    have two slashes after the scheme.
770    """
771    scheme, r_scheme = splittype(proxy)
772    if not r_scheme.startswith("/"):
773        # authority
774        scheme = None
775        authority = proxy
776    else:
777        # URL
778        if not r_scheme.startswith("//"):
779            raise ValueError("proxy URL with no authority: %r" % proxy)
780        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
781        # and 3.3.), path is empty or starts with '/'
782        end = r_scheme.find("/", 2)
783        if end == -1:
784            end = None
785        authority = r_scheme[2:end]
786    userinfo, hostport = splituser(authority)
787    if userinfo is not None:
788        user, password = splitpasswd(userinfo)
789    else:
790        user = password = None
791    return scheme, user, password, hostport
792
class ProxyHandler(BaseHandler):
    """Route requests through configured proxies.

    *proxies* maps a URL scheme to a proxy URL or authority, e.g.
    {"http": "http://proxyhost:3128"}.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Dynamically create a <scheme>_open method per configured proxy.
        # The lambda binds url/type/meth as default arguments to avoid the
        # late-binding closure problem inside this loop.
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go through *proxy*; return None to let the
        normal handlers continue, or re-open for cross-scheme proxies."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            # Scheme-less proxy spec: assume the request's own scheme.
            proxy_type = orig_type

        # proxy_bypass is presumably the platform-specific helper defined
        # later in this module -- hosts it matches skip the proxy.
        if req.host and proxy_bypass(req.host):
            return None

        # Embedded credentials become a Proxy-authorization header.
        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
834
class HTTPPasswordMgr:
    """Keep a mapping of (realm, URIs) -> (user, password) credentials."""

    def __init__(self):
        # {realm: {(reduced_uri, ...): (user, password)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Store credentials for *realm* at *uri* (a URI or sequence of URIs)."""
        if isinstance(uri, str):
            uri = [uri]
        realm_map = self.passwd.setdefault(realm, {})
        # Index the entry both with and without a default port so lookups
        # succeed whether or not the caller spelled the port out.
        for default_port in (True, False):
            key = tuple(self.reduce_uri(u, default_port) for u in uri)
            realm_map[key] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) matching *authuri* in *realm*, else (None, None)."""
        candidates = self.passwd.get(realm, {})
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uris, authinfo in candidates.items():
                if any(self.is_suburi(u, target) for u in uris):
                    return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # full URI: scheme, netloc and (possibly empty) path
            scheme, authority = parts[0], parts[1]
            path = parts[2] or '/'
        else:
            # bare host or host:port
            scheme, authority, path = None, uri, '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80, "https": 443}.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = posixpath.commonprefix((base[1], test[1]))
        return len(prefix) == len(base[1])
897
898
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to a catch-all (None) realm."""

    def find_user_password(self, realm, authuri):
        """Look up *realm* first; fall back to the default realm (None)."""
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
907
908
class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that also tracks which URIs want pre-emptive auth."""

    def __init__(self, *args, **kwargs):
        # Maps reduced URIs to a flag: send credentials before a challenge?
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        """Store credentials and remember the prior-auth flag for *uri*."""
        self.update_authenticated(uri, is_authenticated)
        # Also register under the default realm for prior-auth requests.
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        """Set the prior-auth flag for *uri* (a URI or sequence of URIs)."""
        uris = [uri] if isinstance(uri, str) else uri
        for default_port in (True, False):
            for u in uris:
                key = self.reduce_uri(u, default_port)
                self.authenticated[key] = is_authenticated

    def is_authenticated(self, authuri):
        """Return the stored prior-auth flag covering *authuri*, else None."""
        for default_port in (True, False):
            target = self.reduce_uri(authuri, default_port)
            for uri, flag in self.authenticated.items():
                if self.is_suburi(uri, target):
                    return flag
938
939
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic auth against servers and proxies."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        # password_mgr: any object with the HTTPPasswordMgr interface.
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge in header *authreq* and retry with Basic
        credentials; raises ValueError for non-Basic auth schemes."""
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    # An unquoted realm violates the RFC; warn but accept.
                    if quote not in ['"',"'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-send *req* with an auth header for *realm*.

        Returns None when no credentials are known, or when the same
        credentials were already sent (avoids a retry loop).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Pre-emptively attach credentials when the password manager has
        marked this URL as previously authenticated."""
        if (not hasattr(self.passwd, 'is_authenticated') or
           not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        """Record in the password manager whether the URL authenticated
        successfully (any 2xx response counts as success)."""
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response
1017
1018
1019
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # Retry against the full URL that triggered the challenge.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.full_url, req, headers)
1029
1030
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib.request does
        # not (and should not, RFC 3986 s. 3.2.1) support requests for URLs
        # containing userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.host, req, headers)
1044
1045
1046# Return n random bytes.
1047_randombytes = os.urandom
1048
1049
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP Digest auth against servers and proxies."""
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        # passwd: any object with the HTTPPasswordMgr interface.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried guards against endless challenge loops; nonce_count and
        # last_nonce implement the "nc" counter from RFC 2617 s. 3.2.2.
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        """Reset the failure counter (called after each auth exchange)."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """React to a Digest challenge found in the *auth_header* header.

        Raises HTTPError after too many failed retries, and ValueError for
        auth schemes other than Digest or Basic.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with a Digest Authorization header built from the
        challenge string *auth*.

        Returns None when the same header was already sent (avoids a retry
        loop) or when no authorization could be computed.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-hex-digit client nonce derived from *nonce*."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for *req*.

        *chal* is the parsed challenge dict.  Returns None when required
        challenge fields are missing or no credentials are stored for the
        realm; raises URLError for an unsupported qop.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        # A1/A2 are the credential and request strings from RFC 2617 s. 3.2.2.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # Re-use of the same server nonce increments the nc counter.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) hash helpers for *algorithm* ('MD5' or 'SHA');
        raises ValueError for anything else."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1192
1193
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # Challenge against the netloc of the request URL, then reset the
        # retry counter so a later 401 starts a fresh exchange.
        netloc = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           netloc, req, headers)
        self.reset_retry_count()
        return retry
1210
1211
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (HTTP 407)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.host, req, headers)
        self.reset_retry_count()
        return retry
1223
class AbstractHTTPHandler(BaseHandler):
    """Shared request preparation and sending for the HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        # Forwarded to each new http.client connection.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Set the debug level used for subsequently opened connections."""
        self._debuglevel = level

    def _get_content_length(self, request):
        """Return the request body length, or None if it cannot be known."""
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in default headers (Content-type, Content-length or
        Transfer-encoding, Host, opener-wide addheaders) before sending.

        Raises URLError when the request has no host, and TypeError when
        the POST body is a str rather than bytes.
        """
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            # A body of unknown length is sent chunked instead.
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxied, the Host header must name the origin server
            # taken from the request selector, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. "content-type" -> "Content-Type").
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunnelling (https through a proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err: # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open. Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute is deprecated and get then to use info() or
        # .headers.
        r.msg = r.reason
        return r
1340
1341
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs through http.client.HTTPConnection."""

    def http_open(self, req):
        """Send *req* over a plain (non-TLS) HTTP connection."""
        conn_factory = http.client.HTTPConnection
        return self.do_open(conn_factory, req)

    http_request = AbstractHTTPHandler.do_request_
1348
if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs; only defined when http.client has TLS support."""

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            # *context* (an ssl.SSLContext or None) and *check_hostname*
            # are forwarded verbatim to HTTPSConnection.
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            """Send *req* over a TLS connection."""
            return self.do_open(http.client.HTTPSConnection, req,
                                context=self._context,
                                check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')
1365
class HTTPCookieProcessor(BaseHandler):
    """Attach stored cookies to requests and harvest them from responses."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        """Add any matching cookies from the jar to *request*."""
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Record any cookies set by *response* in the jar."""
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1383
class UnknownHandler(BaseHandler):
    """Last-resort handler: reject URL schemes nobody else claimed."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.type)
1388
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    One level of surrounding double quotes is stripped from each value.
    Returns a dict mapping keys to their (unquoted) values.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard against an empty value ('k='), which would otherwise raise
        # IndexError on v[0].
        if v and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1398
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    current = ''
    in_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: take this one
            # literally (the backslash itself is dropped).
            current += ch
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                current += ch
        elif ch == ',':
            # Unquoted comma ends the current element.
            items.append(current)
            current = ''
        else:
            if ch == '"':
                in_quotes = True
            current += ch

    # append last part
    if current:
        items.append(current)

    return [item.strip() for item in items]
1441
class FileHandler(BaseHandler):
    """Open file: URLs that refer to the local host."""
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file: URL; URLs naming a foreign host are rejected.

        Returns None (so other handlers may try) for a non-localhost host
        that nevertheless resolves to one of this machine's addresses.
        """
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return this machine's IP addresses (cached class-wide)."""
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by *req*.

        Content-type, Content-length and Last-modified headers are
        synthesized from the file's stat data.  Raises URLError on OS
        errors or when the URL's host is not this machine.
        """
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            # Accept a missing host, or a port-less host resolving to one of
            # our own addresses.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            raise URLError(exp)
        raise URLError('file not on local host')
1492
def _safe_gethostbyname(host):
    """Like socket.gethostbyname, but returns None when resolution fails."""
    try:
        addr = socket.gethostbyname(host)
    except socket.gaierror:
        addr = None
    return addr
1498
class FTPHandler(BaseHandler):
    """Open ftp: URLs via ftplib."""

    def ftp_open(self, req):
        """Retrieve the file or directory listing named by an ftp: URL.

        Raises URLError for a missing host, resolution failures, and any
        ftplib error.
        """
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Binary transfer ('I') for files, directory listing ('D')
            # otherwise; a ;type= attribute on the URL overrides this.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create a fresh (non-cached, non-persistent) ftpwrapper."""
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)
1556
class CacheFTPHandler(FTPHandler):
    """FTP handler that keeps a bounded cache of live connections."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # idle lifetime per connection, in seconds
        self.max_conns = 16  # cap on simultaneously cached connections

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a cached ftpwrapper for this target, creating one if needed."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
        # New or reused, the connection gets a fresh expiry time.
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # default=0 guards against an empty cache (possible when
        # max_conns == 1), where min() over no values raises ValueError.
        self.soonest = min(self.timeout.values(), default=0)

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # Close before dropping, so the connection isn't leaked.
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values(), default=0)

    def clear_cache(self):
        """Close and drop every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
1609
class DataHandler(BaseHandler):
    """Handle data: URLs as specified in RFC 2397 (POSTed data is ignored).

    syntax:
      dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
      mediatype := [ type "/" subtype ] *( ";" parameter )
      data      := *urlchar
      parameter := attribute "=" value
    """

    def data_open(self, req):
        url = req.full_url

        scheme, rest = url.split(":", 1)
        mediatype, payload = rest.split(",", 1)

        # Even base64 encoded data URLs might be quoted, so unquote in
        # any case.
        payload = unquote_to_bytes(payload)
        if mediatype.endswith(";base64"):
            payload = base64.decodebytes(payload)
            mediatype = mediatype[:-len(";base64")]

        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        headers = email.message_from_string(
            "Content-type: %s\nContent-length: %d\n" % (mediatype,
                                                        len(payload)))

        return addinfourl(io.BytesIO(payload), headers, url)
1639
1640
# Code moved from the old urllib module
1642
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    # Windows needs drive-letter/backslash aware conversions.
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)


# Module-level FTP connection cache shared by URLopener instances
# (see URLopener.__init__; instances may replace it via .ftpcache).
ftpcache = {}
1661
1662
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default so cleanup() is safe even when __init__ never
    # completed (e.g. __del__ firing after a failed construction).
    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        # This legacy class is deprecated; warn the instantiating caller.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Optional client-certificate data for HTTPS, passed as **x509.
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        # Best-effort cleanup of temporary files on garbage collection.
        self.close()

    def close(self):
        """Release resources held by this opener (temp files, caches)."""
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        # Serve from the (opt-in) retrieve() cache when possible.
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to the scheme-specific open_<type>() method.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            # Wrap other OS-level failures, preserving the traceback.
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # Fast path: a local file needs no copy, just return its path.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except OSError as msg:
                # Fall through to the generic download path below.
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file whose suffix
                # is taken from the URL path (query/attributes stripped).
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                # Copy in fixed-size blocks, reporting progress per block.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: split credentials out of the host part.
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxy request: url is a (proxyhost, full-url) pair set up
            # by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    # Target host is exempt from proxying; go direct.
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        # Encode any credentials for HTTP Basic auth headers.
        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] =  "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            # Non-2xx: delegate to http_error(), which may dispatch to a
            # status-specific handler or raise.
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory carrying this opener's client cert/key.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers from the file's metadata.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A host naming this machine is treated the same as no host.
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        # Cache connections per (user, host, port, directory).
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: directory listing ('D') when no file
            # was named, binary ('I') otherwise; a ;type= attribute in
            # the URL can override this.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without '=' is an encoding (e.g. ;base64),
        # not a mediatype parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        # Build an RFC 822-style message so headers and body parse together.
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
2106
2107
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Cache of (realm@host) -> (user, passwd) gathered by prompting.
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Guard against redirect loops by capping consecutive tries.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow the Location (or legacy URI) header; returns None when
        # neither is present, letting http_error() fall back to default.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE: URLopener.http_error_default always raises HTTPError, so
        # each call below terminates this handler for unsupported cases.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        # Retry with credentials via retry_<scheme>_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but for the proxy's challenge.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                    headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding credentials in the proxy URL.
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any old credentials from the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # HTTPS analogue of retry_proxy_http_basic_auth.
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:pass embedded in the URL host.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # HTTPS analogue of retry_http_basic_auth.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        """Return (user, passwd) for realm@host, prompting and caching
        as needed; clear_cache forces a fresh prompt."""
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
2318
2319
2320# Utility functions
2321
_localhost = None


def localhost():
    """Return the IP address that 'localhost' resolves to (memoized)."""
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
2329
_thishost = None


def thishost():
    """Return the IP addresses of the current host (memoized tuple)."""
    global _thishost
    if _thishost is None:
        try:
            addresses = socket.gethostbyname_ex(socket.gethostname())[2]
        except socket.gaierror:
            # Hostname doesn't resolve; fall back to 'localhost'.
            addresses = socket.gethostbyname_ex('localhost')[2]
        _thishost = tuple(addresses)
    return _thishost
2340
_ftperrors = None


def ftperrors():
    """Return the set of errors raised by the FTP class (memoized)."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
2349
_noheaders = None


def noheaders():
    """Return an empty email Message object (shared, memoized)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    _noheaders = email.message_from_string("")
    return _noheaders
2357
2358
2359# Utility classes
2360
2361class ftpwrapper:
2362    """Class used by open_ftp() for cache of open FTP connections."""
2363
2364    def __init__(self, user, passwd, host, port, dirs, timeout=None,
2365                 persistent=True):
2366        self.user = user
2367        self.passwd = passwd
2368        self.host = host
2369        self.port = port
2370        self.dirs = dirs
2371        self.timeout = timeout
2372        self.refcount = 0
2373        self.keepalive = persistent
2374        try:
2375            self.init()
2376        except:
2377            self.close()
2378            raise
2379
    def init(self):
        """(Re)connect: open the FTP control connection, log in, and
        change to the directory recorded in self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)
2388
2389    def retrfile(self, file, type):
2390        import ftplib
2391        self.endtransfer()
2392        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
2393        else: cmd = 'TYPE ' + type; isdir = 0
2394        try:
2395            self.ftp.voidcmd(cmd)
2396        except ftplib.all_errors:
2397            self.init()
2398            self.ftp.voidcmd(cmd)
2399        conn = None
2400        if file and not isdir:
2401            # Try to retrieve as a file
2402            try:
2403                cmd = 'RETR ' + file
2404                conn, retrlen = self.ftp.ntransfercmd(cmd)
2405            except ftplib.error_perm as reason:
2406                if str(reason)[:3] != '550':
2407                    raise URLError('ftp error: %r' % reason).with_traceback(
2408                        sys.exc_info()[2])
2409        if not conn:
2410            # Set transfer mode to ASCII!
2411            self.ftp.voidcmd('TYPE A')
2412            # Try a directory listing. Verify that directory exists.
2413            if file:
2414                pwd = self.ftp.pwd()
2415                try:
2416                    try:
2417                        self.ftp.cwd(file)
2418                    except ftplib.error_perm as reason:
2419                        raise URLError('ftp error: %r' % reason) from reason
2420                finally:
2421                    self.ftp.cwd(pwd)
2422                cmd = 'LIST ' + file
2423            else:
2424                cmd = 'LIST'
2425            conn, retrlen = self.ftp.ntransfercmd(cmd)
2426        self.busy = 1
2427
2428        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
2429        self.refcount += 1
2430        conn.close()
2431        # Pass back both a suitably decorated object and a retrieval length
2432        return (ftpobj, retrlen)
2433
2434    def endtransfer(self):
2435        self.busy = 0
2436
2437    def close(self):
2438        self.keepalive = False
2439        if self.refcount <= 0:
2440            self.real_close()
2441
2442    def file_close(self):
2443        self.endtransfer()
2444        self.refcount -= 1
2445        if self.refcount <= 0 and not self.keepalive:
2446            self.real_close()
2447
2448    def real_close(self):
2449        self.endtransfer()
2450        try:
2451            self.ftp.close()
2452        except ftperrors():
2453            pass
2454
2455# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # First pass: accept any capitalisation of the <scheme>_proxy
    # suffix, so HTTP_PROXY and http_proxy are both picked up.
    for var, url in os.environ.items():
        key = var.lower()
        if url and key[-6:] == '_proxy':
            proxies[key[:-6]] = url
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    # Second pass: variables whose suffix is already lowercase take
    # precedence; an empty value removes any first-pass entry.
    for var, url in os.environ.items():
        if var[-6:] == '_proxy':
            key = var.lower()[:-6]
            if url:
                proxies[key] = url
            else:
                proxies.pop(key, None)
    return proxies
2486
def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    Returns 1 when the proxy should be bypassed for *host*, else 0.
    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    if 'no' not in proxies:
        return 0
    no_proxy = proxies['no']
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for entry in no_proxy.split(','):
        suffix = entry.strip()
        if not suffix:
            continue
        # ignore leading dots; match the suffix itself or any subdomain
        pattern = r'(.+\.)?%s$' % re.escape(suffix.lstrip('.'))
        if (re.match(pattern, hostonly, re.I)
                or re.match(pattern, host, re.I)):
            return 1
    # otherwise, don't bypass
    return 0
2518
2519
2520# This code tests an OSX specific data structure but is testable on all
2521# platforms
2522def _proxy_bypass_macosx_sysconf(host, proxy_settings):
2523    """
2524    Return True iff this host shouldn't be accessed using a proxy
2525
2526    This function uses the MacOSX framework SystemConfiguration
2527    to fetch the proxy information.
2528
2529    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
2530    { 'exclude_simple': bool,
2531      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
2532    }
2533    """
2534    from fnmatch import fnmatch
2535
2536    hostonly, port = splitport(host)
2537
2538    def ip2num(ipAddr):
2539        parts = ipAddr.split('.')
2540        parts = list(map(int, parts))
2541        if len(parts) != 4:
2542            parts = (parts + [0, 0, 0, 0])[:4]
2543        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
2544
2545    # Check for simple host names:
2546    if '.' not in host:
2547        if proxy_settings['exclude_simple']:
2548            return True
2549
2550    hostIP = None
2551
2552    for value in proxy_settings.get('exceptions', ()):
2553        # Items in the list are strings like these: *.local, 169.254/16
2554        if not value: continue
2555
2556        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
2557        if m is not None:
2558            if hostIP is None:
2559                try:
2560                    hostIP = socket.gethostbyname(hostonly)
2561                    hostIP = ip2num(hostIP)
2562                except OSError:
2563                    continue
2564
2565            base = ip2num(m.group(1))
2566            mask = m.group(2)
2567            if mask is None:
2568                mask = 8 * (m.group(1).count('.') + 1)
2569            else:
2570                mask = int(mask[1:])
2571            mask = 32 - mask
2572
2573            if (hostIP >> mask) == (base >> mask):
2574                return True
2575
2576        elif fnmatch(host, value):
2577            return True
2578
2579    return False
2580
2581
if sys.platform == 'darwin':
    # macOS: consult the SystemConfiguration framework (via the
    # _scproxy C extension) for system-wide proxy settings.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """Return True iff *host* should bypass the proxy according to
        the macOS SystemConfiguration settings."""
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        """Return scheme -> proxy mappings, preferring environment
        variables over the system configuration."""
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    # Windows: proxy configuration lives in the registry under the
    # current user's Internet Settings key.
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            # No scheme given: assume the protocol's own.
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        """Return 1 if *host* matches the registry's ProxyOverride
        patterns (proxy should be bypassed), else 0."""
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            # Registry unreadable: assume no bypass list.
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means: bypass for plain (dot-free) host names.
                if '.' not in rawHost:
                    return 1
            # Translate the glob-style override into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
2738