1"""An extensible library for opening URLs using a variety of protocols 2 3The simplest way to use this module is to call the urlopen function, 4which accepts a string containing a URL or a Request object (described 5below). It opens the URL and returns the results as file-like 6object; the returned object has some extra methods described below. 7 8The OpenerDirector manages a collection of Handler objects that do 9all the actual work. Each Handler implements a particular protocol or 10option. The OpenerDirector is a composite object that invokes the 11Handlers needed to open the requested URL. For example, the 12HTTPHandler performs HTTP GET and POST requests and deals with 13non-error returns. The HTTPRedirectHandler automatically deals with 14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 15deals with digest authentication. 16 17urlopen(url, data=None) -- Basic usage is the same as original 18urllib. pass the url and optionally data to post to an HTTP URL, and 19get a file-like object back. One difference is that you can also pass 20a Request instance instead of URL. Raises a URLError (subclass of 21OSError); for HTTP errors, raises an HTTPError, which can also be 22treated as a valid response. 23 24build_opener -- Function that creates a new OpenerDirector instance. 25Will install the default handlers. Accepts one or more Handlers as 26arguments, either instances or Handler classes that it will 27instantiate. If one of the argument is a subclass of the default 28handler, the argument will be installed instead of the default. 29 30install_opener -- Installs a new opener as the default opener. 31 32objects of interest: 33 34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 35the Handler classes, while dealing with requests and responses. 36 37Request -- An object that encapsulates the state of a request. The 38state can be as simple as the URL. It can also include extra HTTP 39headers, e.g. a User-Agent. 
40 41BaseHandler -- 42 43internals: 44BaseHandler and parent 45_call_chain conventions 46 47Example usage: 48 49import urllib.request 50 51# set up authentication info 52authinfo = urllib.request.HTTPBasicAuthHandler() 53authinfo.add_password(realm='PDQ Application', 54 uri='https://mahler:8092/site-updates.py', 55 user='klem', 56 passwd='geheim$parole') 57 58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"}) 59 60# build a new opener that adds authentication and caching FTP handlers 61opener = urllib.request.build_opener(proxy_support, authinfo, 62 urllib.request.CacheFTPHandler) 63 64# install it 65urllib.request.install_opener(opener) 66 67f = urllib.request.urlopen('http://www.python.org/') 68""" 69 70# XXX issues: 71# If an authentication error handler that tries to perform 72# authentication for some reason but fails, how should the error be 73# signalled? The client needs to know the HTTP error code. But if 74# the handler knows that the problem was, e.g., that it didn't know 75# that hash algo that requested in the challenge, it would be good to 76# pass that information along to the client, too. 77# ftp errors aren't handled cleanly 78# check digest against correct (i.e. 
# non-apache) implementation

# Possible extensions:
# complex proxies XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import re
import socket
import string
import sys
import time
import tempfile
import contextlib
import warnings


from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
    unquote_to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL; HTTPS support is only wired up when the ssl module
# can be imported (see build_opener / HTTPSHandler usage below).
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# Module-global opener used by urlopen(); set by install_opener() or
# lazily created on first use.
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used).  This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options.  See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests.  cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files.  More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.


    This function always returns an object which can work as a
    context manager and has the properties url, headers, and status.
    See urllib.response.addinfourl for more detail on these properties.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified.  In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        import warnings
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a one-off context from the deprecated CA arguments.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # First plain call: create and cache the default global opener.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)

def install_opener(opener):
    """Install *opener* as the global opener used by subsequent urlopen() calls."""
    global _opener
    _opener = opener

# Names of temporary files created by urlretrieve(); removed by urlcleanup().
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument.  If a filename is passed, it is used as
    the temporary file location.  The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target.  The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    # Raise if the server promised more bytes than it delivered.
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result

def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except OSError:
            # Best-effort cleanup; the file may already be gone.
            pass

    del _url_tempfiles[:]
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        # Relative URL: fall back to the Host header, if any.
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:
    """Encapsulate the state of a URL request: URL, data, headers, method."""

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        self.full_url = url
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        # Re-attach the fragment that the setter split off.
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        # Split the stored URL into type (scheme), host and selector.
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        return self.full_url

    def set_proxy(self, host, type):
        # For https, the proxy is reached via CONNECT tunnelling, so the
        # original host/selector are preserved and only the tunnel target
        # is recorded.
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        # Ordinary headers take precedence over unredirected ones.
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())

class OpenerDirector:
    """Manage a chain of handlers and drive request/response processing."""

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register *handler*, indexing its methods by naming convention.

        Methods named <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<code> are discovered
        via dir() and filed into the corresponding lookup tables.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # e.g. http_error_404 -> protocol 'http', kind 404
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    # non-numeric kinds like 'default' stay as strings
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # keep each chain sorted by handler_order (BaseHandler.__lt__)
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or Request), returning a response."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # Try default_open first, then protocol-specific, then unknown_open.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error_* handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        # No code-specific handler claimed it: fall back to the default
        # http error handler (which raises HTTPError).
        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    # A user-supplied handler (class or instance) that subclasses a default
    # replaces that default entirely.
    for klass in default_classes:
        for check in handlers:
            if isinstance(check, type):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isinstance(h, type):
            h = h()
        opener.add_handler(h)
    return opener

class BaseHandler:
    """Base class for all handlers; provides ordering and parent linkage."""
    handler_order = 500

    def add_parent(self, parent):
        # parent is the OpenerDirector this handler was added to.
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order
of having custom classes 616 # inserted after default ones (works only for custom user 617 # classes which are not aware of handler_order). 618 return True 619 return self.handler_order < other.handler_order 620 621 622class HTTPErrorProcessor(BaseHandler): 623 """Process HTTP error responses.""" 624 handler_order = 1000 # after all other processing 625 626 def http_response(self, request, response): 627 code, msg, hdrs = response.code, response.msg, response.info() 628 629 # According to RFC 2616, "2xx" code indicates that the client's 630 # request was successfully received, understood, and accepted. 631 if not (200 <= code < 300): 632 response = self.parent.error( 633 'http', request, response, code, msg, hdrs) 634 635 return response 636 637 https_response = http_response 638 639class HTTPDefaultErrorHandler(BaseHandler): 640 def http_error_default(self, req, fp, code, msg, hdrs): 641 raise HTTPError(req.full_url, code, msg, hdrs, fp) 642 643class HTTPRedirectHandler(BaseHandler): 644 # maximum number of redirections to any single URL 645 # this is needed because of the state that cookies introduce 646 max_repeats = 4 647 # maximum total number of redirections (regardless of URL) before 648 # assuming we're in a loop 649 max_redirections = 10 650 651 def redirect_request(self, req, fp, code, msg, headers, newurl): 652 """Return a Request or None in response to a redirect. 653 654 This is called by the http_error_30x methods when a 655 redirection response is received. If a redirection should 656 take place, return a new Request to allow http_error_30x to 657 perform the redirect. Otherwise, raise HTTPError if no-one 658 else should try to handle this url. Return None if you can't 659 but another Handler might. 
660 """ 661 m = req.get_method() 662 if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD") 663 or code in (301, 302, 303) and m == "POST")): 664 raise HTTPError(req.full_url, code, msg, headers, fp) 665 666 # Strictly (according to RFC 2616), 301 or 302 in response to 667 # a POST MUST NOT cause a redirection without confirmation 668 # from the user (of urllib.request, in this case). In practice, 669 # essentially all clients do redirect in this case, so we do 670 # the same. 671 672 # Be conciliant with URIs containing a space. This is mainly 673 # redundant with the more complete encoding done in http_error_302(), 674 # but it is kept for compatibility with other callers. 675 newurl = newurl.replace(' ', '%20') 676 677 CONTENT_HEADERS = ("content-length", "content-type") 678 newheaders = {k: v for k, v in req.headers.items() 679 if k.lower() not in CONTENT_HEADERS} 680 return Request(newurl, 681 headers=newheaders, 682 origin_req_host=req.origin_req_host, 683 unverifiable=True) 684 685 # Implementation note: To avoid the server sending us into an 686 # infinite loop, the request object needs to track what URLs we 687 # have already seen. Do this by adding a handler-specific 688 # attribute to the Request object. 689 def http_error_302(self, req, fp, code, msg, headers): 690 # Some servers (incorrectly) return multiple Location headers 691 # (so probably same goes for URI). Use first header. 692 if "location" in headers: 693 newurl = headers["location"] 694 elif "uri" in headers: 695 newurl = headers["uri"] 696 else: 697 return 698 699 # fix a possible malformed URL 700 urlparts = urlparse(newurl) 701 702 # For security reasons we don't allow redirection to anything other 703 # than http, https or ftp. 
704 705 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 706 raise HTTPError( 707 newurl, code, 708 "%s - Redirection to url '%s' is not allowed" % (msg, newurl), 709 headers, fp) 710 711 if not urlparts.path and urlparts.netloc: 712 urlparts = list(urlparts) 713 urlparts[2] = "/" 714 newurl = urlunparse(urlparts) 715 716 # http.client.parse_headers() decodes as ISO-8859-1. Recover the 717 # original bytes and percent-encode non-ASCII bytes, and any special 718 # characters such as the space. 719 newurl = quote( 720 newurl, encoding="iso-8859-1", safe=string.punctuation) 721 newurl = urljoin(req.full_url, newurl) 722 723 # XXX Probably want to forget about the state of the current 724 # request, although that might interact poorly with other 725 # handlers that also use handler-specific request attributes 726 new = self.redirect_request(req, fp, code, msg, headers, newurl) 727 if new is None: 728 return 729 730 # loop detection 731 # .redirect_dict has a key url if url was previously visited. 732 if hasattr(req, 'redirect_dict'): 733 visited = new.redirect_dict = req.redirect_dict 734 if (visited.get(newurl, 0) >= self.max_repeats or 735 len(visited) >= self.max_redirections): 736 raise HTTPError(req.full_url, code, 737 self.inf_msg + msg, headers, fp) 738 else: 739 visited = new.redirect_dict = req.redirect_dict = {} 740 visited[newurl] = visited.get(newurl, 0) + 1 741 742 # Don't close the fp until we are sure that we won't use it 743 # with HTTPError. 744 fp.read() 745 fp.close() 746 747 return self.parent.open(new, timeout=req.timeout) 748 749 http_error_301 = http_error_303 = http_error_307 = http_error_302 750 751 inf_msg = "The HTTP server returned a redirect error that would " \ 752 "lead to an infinite loop.\n" \ 753 "The last 30x error message was:\n" 754 755 756def _parse_proxy(proxy): 757 """Return (scheme, user, password, host/port) given a URL or an authority. 758 759 If a URL is supplied, it must have an authority (host:port) component. 
class ProxyHandler(BaseHandler):
    """Route requests through the proxies given in the *proxies* mapping."""
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Dynamically create one <scheme>_open method per proxied scheme so
        # that OpenerDirector.add_handler() discovers it by naming convention.
        for type, url in proxies.items():
            type = type.lower()
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        # Honour no_proxy-style bypass rules for this host.
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:
    """Map (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # {realm: {tuple-of-reduced-uris: (user, passwd)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Store the URIs reduced both with and without the default port so
        # lookups match either spelling.
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, passwd) for *authuri* in *realm*, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize to an explicit default port for the known schemes.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.

        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False


class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to realm None when no realm matches."""

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that additionally tracks which URIs may be sent
    credentials pre-emptively (prior auth)."""

    def __init__(self, *args, **kwargs):
        # {reduced-uri: bool} -- whether to send credentials unprompted
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        self.update_authenticated(uri, is_authenticated)
        # Add a default for prior auth requests
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]

        for default_port in True, False:
            for u in uri:
                reduced_uri = self.reduce_uri(u, default_port)
                self.authenticated[reduced_uri] = is_authenticated

    def is_authenticated(self, authuri):
        # Returns the stored flag for the closest matching URI, or None
        # if no registered URI is a prefix of authuri.
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uri in self.authenticated:
                if self.is_suburi(uri, reduced_authuri):
                    return self.authenticated[uri]


class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic auth (www and proxy variants)."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'       # start of the string or ','
                    '[ \t]*'        # optional whitespaces
                    '([^ \t,]+)'    # scheme like "Basic"
                    '[ \t]+'        # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            # No regex match: report the bare scheme with no realm.
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (scheme,))

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            # Avoid infinite retry: if we already sent exactly this header
            # and still got challenged, give up.
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        # Pre-emptively attach credentials when the password manager
        # supports prior auth and has marked this URI as authenticated.
        if (not hasattr(self.passwd, 'is_authenticated') or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        # Record whether this URI accepted our credentials so later
        # requests can (or must not) send them pre-emptively.
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response



class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        response = self.http_error_auth_reqed('www-authenticate',
                                              url, req, headers)
        return response


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'
'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        return response


# Return n random bytes.
_randombytes = os.urandom


class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried: guards against endless 401/407 loops (see below).
        self.retried = 0
        # nonce_count/last_nonce: per-nonce counter required for the
        # qop="auth" response computation (RFC 2617 "nc" value).
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Dispatch a Digest challenge from *auth_header*; gives up after
        5 retries.  Basic challenges are silently ignored (left for the
        Basic handler); any other scheme raises ValueError."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with a Digest Authorization header built from
        the challenge *auth*; returns None if credentials are missing or
        if the identical header was already sent (loop guard)."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value (RFC 2617 s.3.2.2)
        for *req* from challenge dict *chal*, or return None when the
        challenge is malformed or no credentials are known."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        # A1/A2 per RFC 2617 s.3.2.2.2-3.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        # NOTE: As per RFC 2617, when server sends "auth,auth-int", the client could use either `auth`
        #     or `auth-int` to the response back. we use `auth` to send the response back.
        if qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        elif 'auth' in qop.split(','):
            # nc must increase monotonically per nonce; reset when the
            # server issues a fresh nonce.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) hash callables for *algorithm* (RFC 2617 s.3.2.1);
        raises ValueError for algorithms other than MD5/SHA."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
1229 """ 1230 1231 auth_header = 'Authorization' 1232 handler_order = 490 # before Basic auth 1233 1234 def http_error_401(self, req, fp, code, msg, headers): 1235 host = urlparse(req.full_url)[1] 1236 retry = self.http_error_auth_reqed('www-authenticate', 1237 host, req, headers) 1238 self.reset_retry_count() 1239 return retry 1240 1241 1242class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1243 1244 auth_header = 'Proxy-Authorization' 1245 handler_order = 490 # before Basic auth 1246 1247 def http_error_407(self, req, fp, code, msg, headers): 1248 host = req.host 1249 retry = self.http_error_auth_reqed('proxy-authenticate', 1250 host, req, headers) 1251 self.reset_retry_count() 1252 return retry 1253 1254class AbstractHTTPHandler(BaseHandler): 1255 1256 def __init__(self, debuglevel=0): 1257 self._debuglevel = debuglevel 1258 1259 def set_http_debuglevel(self, level): 1260 self._debuglevel = level 1261 1262 def _get_content_length(self, request): 1263 return http.client.HTTPConnection._get_content_length( 1264 request.data, 1265 request.get_method()) 1266 1267 def do_request_(self, request): 1268 host = request.host 1269 if not host: 1270 raise URLError('no host given') 1271 1272 if request.data is not None: # POST 1273 data = request.data 1274 if isinstance(data, str): 1275 msg = "POST data should be bytes, an iterable of bytes, " \ 1276 "or a file object. It cannot be of type str." 
1277 raise TypeError(msg) 1278 if not request.has_header('Content-type'): 1279 request.add_unredirected_header( 1280 'Content-type', 1281 'application/x-www-form-urlencoded') 1282 if (not request.has_header('Content-length') 1283 and not request.has_header('Transfer-encoding')): 1284 content_length = self._get_content_length(request) 1285 if content_length is not None: 1286 request.add_unredirected_header( 1287 'Content-length', str(content_length)) 1288 else: 1289 request.add_unredirected_header( 1290 'Transfer-encoding', 'chunked') 1291 1292 sel_host = host 1293 if request.has_proxy(): 1294 scheme, sel = _splittype(request.selector) 1295 sel_host, sel_path = _splithost(sel) 1296 if not request.has_header('Host'): 1297 request.add_unredirected_header('Host', sel_host) 1298 for name, value in self.parent.addheaders: 1299 name = name.capitalize() 1300 if not request.has_header(name): 1301 request.add_unredirected_header(name, value) 1302 1303 return request 1304 1305 def do_open(self, http_class, req, **http_conn_args): 1306 """Return an HTTPResponse object for the request, using http_class. 1307 1308 http_class must implement the HTTPConnection API from http.client. 1309 """ 1310 host = req.host 1311 if not host: 1312 raise URLError('no host given') 1313 1314 # will parse host:port 1315 h = http_class(host, timeout=req.timeout, **http_conn_args) 1316 h.set_debuglevel(self._debuglevel) 1317 1318 headers = dict(req.unredirected_hdrs) 1319 headers.update({k: v for k, v in req.headers.items() 1320 if k not in headers}) 1321 1322 # TODO(jhylton): Should this be redesigned to handle 1323 # persistent connections? 1324 1325 # We want to make an HTTP/1.1 request, but the addinfourl 1326 # class isn't prepared to deal with a persistent connection. 1327 # It will try to read all remaining data from the socket, 1328 # which will block while the server waits for the next request. 1329 # So make sure the connection gets closed after the (only) 1330 # request. 
1331 headers["Connection"] = "close" 1332 headers = {name.title(): val for name, val in headers.items()} 1333 1334 if req._tunnel_host: 1335 tunnel_headers = {} 1336 proxy_auth_hdr = "Proxy-Authorization" 1337 if proxy_auth_hdr in headers: 1338 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] 1339 # Proxy-Authorization should not be sent to origin 1340 # server. 1341 del headers[proxy_auth_hdr] 1342 h.set_tunnel(req._tunnel_host, headers=tunnel_headers) 1343 1344 try: 1345 try: 1346 h.request(req.get_method(), req.selector, req.data, headers, 1347 encode_chunked=req.has_header('Transfer-encoding')) 1348 except OSError as err: # timeout error 1349 raise URLError(err) 1350 r = h.getresponse() 1351 except: 1352 h.close() 1353 raise 1354 1355 # If the server does not send us a 'Connection: close' header, 1356 # HTTPConnection assumes the socket should be left open. Manually 1357 # mark the socket to be closed when this response object goes away. 1358 if h.sock: 1359 h.sock.close() 1360 h.sock = None 1361 1362 r.url = req.get_full_url() 1363 # This line replaces the .msg attribute of the HTTPResponse 1364 # with .headers, because urllib clients expect the response to 1365 # have the reason in .msg. It would be good to mark this 1366 # attribute is deprecated and get then to use info() or 1367 # .headers. 
1368 r.msg = r.reason 1369 return r 1370 1371 1372class HTTPHandler(AbstractHTTPHandler): 1373 1374 def http_open(self, req): 1375 return self.do_open(http.client.HTTPConnection, req) 1376 1377 http_request = AbstractHTTPHandler.do_request_ 1378 1379if hasattr(http.client, 'HTTPSConnection'): 1380 1381 class HTTPSHandler(AbstractHTTPHandler): 1382 1383 def __init__(self, debuglevel=0, context=None, check_hostname=None): 1384 AbstractHTTPHandler.__init__(self, debuglevel) 1385 self._context = context 1386 self._check_hostname = check_hostname 1387 1388 def https_open(self, req): 1389 return self.do_open(http.client.HTTPSConnection, req, 1390 context=self._context, check_hostname=self._check_hostname) 1391 1392 https_request = AbstractHTTPHandler.do_request_ 1393 1394 __all__.append('HTTPSHandler') 1395 1396class HTTPCookieProcessor(BaseHandler): 1397 def __init__(self, cookiejar=None): 1398 import http.cookiejar 1399 if cookiejar is None: 1400 cookiejar = http.cookiejar.CookieJar() 1401 self.cookiejar = cookiejar 1402 1403 def http_request(self, request): 1404 self.cookiejar.add_cookie_header(request) 1405 return request 1406 1407 def http_response(self, request, response): 1408 self.cookiejar.extract_cookies(response, request) 1409 return response 1410 1411 https_request = http_request 1412 https_response = http_response 1413 1414class UnknownHandler(BaseHandler): 1415 def unknown_open(self, req): 1416 type = req.type 1417 raise URLError('unknown url type: %s' % type) 1418 1419def parse_keqv_list(l): 1420 """Parse list of key=value strings where keys are not duplicated.""" 1421 parsed = {} 1422 for elt in l: 1423 k, v = elt.split('=', 1) 1424 if v[0] == '"' and v[-1] == '"': 1425 v = v[1:-1] 1426 parsed[k] = v 1427 return parsed 1428 1429def parse_http_list(s): 1430 """Parse lists as described by RFC 2068 Section 2. 1431 1432 In particular, parse comma-separated lists where the elements of 1433 the list may include quoted-strings. 
A quoted-string could 1434 contain a comma. A non-quoted string could have quotes in the 1435 middle. Neither commas nor quotes count if they are escaped. 1436 Only double-quotes count, not single-quotes. 1437 """ 1438 res = [] 1439 part = '' 1440 1441 escape = quote = False 1442 for cur in s: 1443 if escape: 1444 part += cur 1445 escape = False 1446 continue 1447 if quote: 1448 if cur == '\\': 1449 escape = True 1450 continue 1451 elif cur == '"': 1452 quote = False 1453 part += cur 1454 continue 1455 1456 if cur == ',': 1457 res.append(part) 1458 part = '' 1459 continue 1460 1461 if cur == '"': 1462 quote = True 1463 1464 part += cur 1465 1466 # append last part 1467 if part: 1468 res.append(part) 1469 1470 return [part.strip() for part in res] 1471 1472class FileHandler(BaseHandler): 1473 # Use local file or FTP depending on form of URL 1474 def file_open(self, req): 1475 url = req.selector 1476 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1477 req.host != 'localhost'): 1478 if not req.host in self.get_names(): 1479 raise URLError("file:// scheme is supported only on localhost") 1480 else: 1481 return self.open_local_file(req) 1482 1483 # names for the localhost 1484 names = None 1485 def get_names(self): 1486 if FileHandler.names is None: 1487 try: 1488 FileHandler.names = tuple( 1489 socket.gethostbyname_ex('localhost')[2] + 1490 socket.gethostbyname_ex(socket.gethostname())[2]) 1491 except socket.gaierror: 1492 FileHandler.names = (socket.gethostbyname('localhost'),) 1493 return FileHandler.names 1494 1495 # not entirely sure what the rules are here 1496 def open_local_file(self, req): 1497 import email.utils 1498 import mimetypes 1499 host = req.host 1500 filename = req.selector 1501 localfile = url2pathname(filename) 1502 try: 1503 stats = os.stat(localfile) 1504 size = stats.st_size 1505 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1506 mtype = mimetypes.guess_type(filename)[0] 1507 headers = email.message_from_string( 1508 
'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1509 (mtype or 'text/plain', size, modified)) 1510 if host: 1511 host, port = _splitport(host) 1512 if not host or \ 1513 (not port and _safe_gethostbyname(host) in self.get_names()): 1514 if host: 1515 origurl = 'file://' + host + filename 1516 else: 1517 origurl = 'file://' + filename 1518 return addinfourl(open(localfile, 'rb'), headers, origurl) 1519 except OSError as exp: 1520 raise URLError(exp) 1521 raise URLError('file not on local host') 1522 1523def _safe_gethostbyname(host): 1524 try: 1525 return socket.gethostbyname(host) 1526 except socket.gaierror: 1527 return None 1528 1529class FTPHandler(BaseHandler): 1530 def ftp_open(self, req): 1531 import ftplib 1532 import mimetypes 1533 host = req.host 1534 if not host: 1535 raise URLError('ftp error: no host given') 1536 host, port = _splitport(host) 1537 if port is None: 1538 port = ftplib.FTP_PORT 1539 else: 1540 port = int(port) 1541 1542 # username/password handling 1543 user, host = _splituser(host) 1544 if user: 1545 user, passwd = _splitpasswd(user) 1546 else: 1547 passwd = None 1548 host = unquote(host) 1549 user = user or '' 1550 passwd = passwd or '' 1551 1552 try: 1553 host = socket.gethostbyname(host) 1554 except OSError as msg: 1555 raise URLError(msg) 1556 path, attrs = _splitattr(req.selector) 1557 dirs = path.split('/') 1558 dirs = list(map(unquote, dirs)) 1559 dirs, file = dirs[:-1], dirs[-1] 1560 if dirs and not dirs[0]: 1561 dirs = dirs[1:] 1562 try: 1563 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) 1564 type = file and 'I' or 'D' 1565 for attr in attrs: 1566 attr, value = _splitvalue(attr) 1567 if attr.lower() == 'type' and \ 1568 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1569 type = value.upper() 1570 fp, retrlen = fw.retrfile(file, type) 1571 headers = "" 1572 mtype = mimetypes.guess_type(req.full_url)[0] 1573 if mtype: 1574 headers += "Content-type: %s\n" % mtype 1575 if retrlen is not None and retrlen 
>= 0: 1576 headers += "Content-length: %d\n" % retrlen 1577 headers = email.message_from_string(headers) 1578 return addinfourl(fp, headers, req.full_url) 1579 except ftplib.all_errors as exp: 1580 exc = URLError('ftp error: %r' % exp) 1581 raise exc.with_traceback(sys.exc_info()[2]) 1582 1583 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1584 return ftpwrapper(user, passwd, host, port, dirs, timeout, 1585 persistent=False) 1586 1587class CacheFTPHandler(FTPHandler): 1588 # XXX would be nice to have pluggable cache strategies 1589 # XXX this stuff is definitely not thread safe 1590 def __init__(self): 1591 self.cache = {} 1592 self.timeout = {} 1593 self.soonest = 0 1594 self.delay = 60 1595 self.max_conns = 16 1596 1597 def setTimeout(self, t): 1598 self.delay = t 1599 1600 def setMaxConns(self, m): 1601 self.max_conns = m 1602 1603 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1604 key = user, host, port, '/'.join(dirs), timeout 1605 if key in self.cache: 1606 self.timeout[key] = time.time() + self.delay 1607 else: 1608 self.cache[key] = ftpwrapper(user, passwd, host, port, 1609 dirs, timeout) 1610 self.timeout[key] = time.time() + self.delay 1611 self.check_cache() 1612 return self.cache[key] 1613 1614 def check_cache(self): 1615 # first check for old ones 1616 t = time.time() 1617 if self.soonest <= t: 1618 for k, v in list(self.timeout.items()): 1619 if v < t: 1620 self.cache[k].close() 1621 del self.cache[k] 1622 del self.timeout[k] 1623 self.soonest = min(list(self.timeout.values())) 1624 1625 # then check the size 1626 if len(self.cache) == self.max_conns: 1627 for k, v in list(self.timeout.items()): 1628 if v == self.soonest: 1629 del self.cache[k] 1630 del self.timeout[k] 1631 break 1632 self.soonest = min(list(self.timeout.values())) 1633 1634 def clear_cache(self): 1635 for conn in self.cache.values(): 1636 conn.close() 1637 self.cache.clear() 1638 self.timeout.clear() 1639 1640class DataHandler(BaseHandler): 1641 def 
data_open(self, req): 1642 # data URLs as specified in RFC 2397. 1643 # 1644 # ignores POSTed data 1645 # 1646 # syntax: 1647 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 1648 # mediatype := [ type "/" subtype ] *( ";" parameter ) 1649 # data := *urlchar 1650 # parameter := attribute "=" value 1651 url = req.full_url 1652 1653 scheme, data = url.split(":",1) 1654 mediatype, data = data.split(",",1) 1655 1656 # even base64 encoded data URLs might be quoted so unquote in any case: 1657 data = unquote_to_bytes(data) 1658 if mediatype.endswith(";base64"): 1659 data = base64.decodebytes(data) 1660 mediatype = mediatype[:-7] 1661 1662 if not mediatype: 1663 mediatype = "text/plain;charset=US-ASCII" 1664 1665 headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" % 1666 (mediatype, len(data))) 1667 1668 return addinfourl(io.BytesIO(data), headers, url) 1669 1670 1671# Code move from the old urllib module 1672 1673MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 1674 1675# Helper for non-unix systems 1676if os.name == 'nt': 1677 from nturl2path import url2pathname, pathname2url 1678else: 1679 def url2pathname(pathname): 1680 """OS-specific conversion from a relative URL of the 'file' scheme 1681 to a file system path; not recommended for general use.""" 1682 return unquote(pathname) 1683 1684 def pathname2url(pathname): 1685 """OS-specific conversion from a file system path to a relative URL 1686 of the 'file' scheme; not recommended for general use.""" 1687 return quote(pathname) 1688 1689 1690ftpcache = {} 1691 1692 1693class URLopener: 1694 """Class to open URLs. 1695 This is a class rather than just a subroutine because we may need 1696 more than one set of global protocol-specific options. 
1697 Note -- this is a base class for those who don't want the 1698 automatic handling of errors type 302 (relocated) and 401 1699 (authorization needed).""" 1700 1701 __tempfiles = None 1702 1703 version = "Python-urllib/%s" % __version__ 1704 1705 # Constructor 1706 def __init__(self, proxies=None, **x509): 1707 msg = "%(class)s style of invoking requests is deprecated. " \ 1708 "Use newer urlopen functions/methods" % {'class': self.__class__.__name__} 1709 warnings.warn(msg, DeprecationWarning, stacklevel=3) 1710 if proxies is None: 1711 proxies = getproxies() 1712 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 1713 self.proxies = proxies 1714 self.key_file = x509.get('key_file') 1715 self.cert_file = x509.get('cert_file') 1716 self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')] 1717 self.__tempfiles = [] 1718 self.__unlink = os.unlink # See cleanup() 1719 self.tempcache = None 1720 # Undocumented feature: if you assign {} to tempcache, 1721 # it is used to cache files retrieved with 1722 # self.retrieve(). This is not enabled by default 1723 # since it does not work for changing documents (and I 1724 # haven't got the logic to check expiration headers 1725 # yet). 1726 self.ftpcache = ftpcache 1727 # Undocumented feature: you can use a different 1728 # ftp cache by assigning to the .ftpcache member; 1729 # in case you want logically independent URL openers 1730 # XXX This is not threadsafe. Bah. 1731 1732 def __del__(self): 1733 self.close() 1734 1735 def close(self): 1736 self.cleanup() 1737 1738 def cleanup(self): 1739 # This code sometimes runs when the rest of this module 1740 # has already been deleted, so it can't use any globals 1741 # or import anything. 
1742 if self.__tempfiles: 1743 for file in self.__tempfiles: 1744 try: 1745 self.__unlink(file) 1746 except OSError: 1747 pass 1748 del self.__tempfiles[:] 1749 if self.tempcache: 1750 self.tempcache.clear() 1751 1752 def addheader(self, *args): 1753 """Add a header to be used by the HTTP interface only 1754 e.g. u.addheader('Accept', 'sound/basic')""" 1755 self.addheaders.append(args) 1756 1757 # External interface 1758 def open(self, fullurl, data=None): 1759 """Use URLopener().open(file) instead of open(file, 'r').""" 1760 fullurl = unwrap(_to_bytes(fullurl)) 1761 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 1762 if self.tempcache and fullurl in self.tempcache: 1763 filename, headers = self.tempcache[fullurl] 1764 fp = open(filename, 'rb') 1765 return addinfourl(fp, headers, fullurl) 1766 urltype, url = _splittype(fullurl) 1767 if not urltype: 1768 urltype = 'file' 1769 if urltype in self.proxies: 1770 proxy = self.proxies[urltype] 1771 urltype, proxyhost = _splittype(proxy) 1772 host, selector = _splithost(proxyhost) 1773 url = (host, fullurl) # Signal special case to open_*() 1774 else: 1775 proxy = None 1776 name = 'open_' + urltype 1777 self.type = urltype 1778 name = name.replace('-', '_') 1779 if not hasattr(self, name) or name == 'open_local_file': 1780 if proxy: 1781 return self.open_unknown_proxy(proxy, fullurl, data) 1782 else: 1783 return self.open_unknown(fullurl, data) 1784 try: 1785 if data is None: 1786 return getattr(self, name)(url) 1787 else: 1788 return getattr(self, name)(url, data) 1789 except (HTTPError, URLError): 1790 raise 1791 except OSError as msg: 1792 raise OSError('socket error', msg).with_traceback(sys.exc_info()[2]) 1793 1794 def open_unknown(self, fullurl, data=None): 1795 """Overridable interface to open unknown URL type.""" 1796 type, url = _splittype(fullurl) 1797 raise OSError('url error', 'unknown url type', type) 1798 1799 def open_unknown_proxy(self, proxy, fullurl, data=None): 1800 """Overridable interface to 
open unknown URL type.""" 1801 type, url = _splittype(fullurl) 1802 raise OSError('url error', 'invalid proxy for %s' % type, proxy) 1803 1804 # External interface 1805 def retrieve(self, url, filename=None, reporthook=None, data=None): 1806 """retrieve(url) returns (filename, headers) for a local object 1807 or (tempfilename, headers) for a remote object.""" 1808 url = unwrap(_to_bytes(url)) 1809 if self.tempcache and url in self.tempcache: 1810 return self.tempcache[url] 1811 type, url1 = _splittype(url) 1812 if filename is None and (not type or type == 'file'): 1813 try: 1814 fp = self.open_local_file(url1) 1815 hdrs = fp.info() 1816 fp.close() 1817 return url2pathname(_splithost(url1)[1]), hdrs 1818 except OSError: 1819 pass 1820 fp = self.open(url, data) 1821 try: 1822 headers = fp.info() 1823 if filename: 1824 tfp = open(filename, 'wb') 1825 else: 1826 garbage, path = _splittype(url) 1827 garbage, path = _splithost(path or "") 1828 path, garbage = _splitquery(path or "") 1829 path, garbage = _splitattr(path or "") 1830 suffix = os.path.splitext(path)[1] 1831 (fd, filename) = tempfile.mkstemp(suffix) 1832 self.__tempfiles.append(filename) 1833 tfp = os.fdopen(fd, 'wb') 1834 try: 1835 result = filename, headers 1836 if self.tempcache is not None: 1837 self.tempcache[url] = result 1838 bs = 1024*8 1839 size = -1 1840 read = 0 1841 blocknum = 0 1842 if "content-length" in headers: 1843 size = int(headers["Content-Length"]) 1844 if reporthook: 1845 reporthook(blocknum, bs, size) 1846 while 1: 1847 block = fp.read(bs) 1848 if not block: 1849 break 1850 read += len(block) 1851 tfp.write(block) 1852 blocknum += 1 1853 if reporthook: 1854 reporthook(blocknum, bs, size) 1855 finally: 1856 tfp.close() 1857 finally: 1858 fp.close() 1859 1860 # raise exception if actual size does not match content-length header 1861 if size >= 0 and read < size: 1862 raise ContentTooShortError( 1863 "retrieval incomplete: got only %i out of %i bytes" 1864 % (read, size), result) 1865 1866 
return result 1867 1868 # Each method named open_<type> knows how to open that type of URL 1869 1870 def _open_generic_http(self, connection_factory, url, data): 1871 """Make an HTTP connection using connection_class. 1872 1873 This is an internal method that should be called from 1874 open_http() or open_https(). 1875 1876 Arguments: 1877 - connection_factory should take a host name and return an 1878 HTTPConnection instance. 1879 - url is the url to retrieval or a host, relative-path pair. 1880 - data is payload for a POST request or None. 1881 """ 1882 1883 user_passwd = None 1884 proxy_passwd= None 1885 if isinstance(url, str): 1886 host, selector = _splithost(url) 1887 if host: 1888 user_passwd, host = _splituser(host) 1889 host = unquote(host) 1890 realhost = host 1891 else: 1892 host, selector = url 1893 # check whether the proxy contains authorization information 1894 proxy_passwd, host = _splituser(host) 1895 # now we proceed with the url we want to obtain 1896 urltype, rest = _splittype(selector) 1897 url = rest 1898 user_passwd = None 1899 if urltype.lower() != 'http': 1900 realhost = None 1901 else: 1902 realhost, rest = _splithost(rest) 1903 if realhost: 1904 user_passwd, realhost = _splituser(realhost) 1905 if user_passwd: 1906 selector = "%s://%s%s" % (urltype, realhost, rest) 1907 if proxy_bypass(realhost): 1908 host = realhost 1909 1910 if not host: raise OSError('http error', 'no host given') 1911 1912 if proxy_passwd: 1913 proxy_passwd = unquote(proxy_passwd) 1914 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii') 1915 else: 1916 proxy_auth = None 1917 1918 if user_passwd: 1919 user_passwd = unquote(user_passwd) 1920 auth = base64.b64encode(user_passwd.encode()).decode('ascii') 1921 else: 1922 auth = None 1923 http_conn = connection_factory(host) 1924 headers = {} 1925 if proxy_auth: 1926 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth 1927 if auth: 1928 headers["Authorization"] = "Basic %s" % auth 1929 if realhost: 
        # --- continuation of URLopener._open_generic_http (method header is
        # above this chunk) ---
        headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            # Non-2xx: delegate to the (possibly subclass-overridden)
            # error machinery.
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error.
        # A handler returning a falsy value falls through to the default.
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        # HTTPError is a subclass of OSError, hence the docstring's wording.
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Connection factory passed to _open_generic_http; mirrors the
            # HTTPConnection class used by open_http.
            return http.client.HTTPSConnection(host,
                                               key_file=self.key_file,
                                               cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # Reject file://host/... URLs naming a non-local host; bare
        # file:///... (empty authority) and file://localhost/... pass through.
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize an email.message.Message carrying the usual HTTP-style
        # metadata so callers can treat the result like an HTTP response.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        # A host is acceptable only if it has no explicit port and resolves
        # to this machine (localhost or one of our own addresses).
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        # Connections are cached per (user, host, port, directory) so that
        # repeated fetches reuse an open control connection.
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: directory listing when no file is named,
            # binary ('I'mage) otherwise; a ;type= attribute may override it.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";base64" (no '=') marks the encoding; any other
        # ;attr=value parameter stays part of the media type.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # auth_cache maps "realm@host" -> (user, passwd); tries/maxtries
        # guard against unbounded redirect recursion.
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Unlike URLopener.http_error_default, errors are returned as a
        # response object rather than raised.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Redirects recurse through self.open(); cap the depth and
            # report recursion as a synthetic 500.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow a Location/URI header; returns None (no result) when the
        # response carries neither.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        # 307 must not change the request method, so a POST (data given)
        # cannot be silently replayed as a GET; treat it as an error.
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # URLopener.http_error_default raises HTTPError, so each call below
        # terminates this handler when the challenge cannot be handled.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth based
        # on the scheme of the original request.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but against the Proxy-Authenticate
        # challenge and the retry_proxy_* helpers.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip any credentials already embedded in the proxy host; a
        # nonzero i (old credentials present) also clears the auth cache.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        # Drop credentials already in the URL and re-prompt (clear_cache=i).
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        # A truthy clear_cache drops the cached entry so the user is
        # prompted again instead of reusing stale credentials.
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None


# Utility functions

# Each helper below lazily computes and caches its value in a module-level
# global on first use.

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP addresses of the current host."""
    global _thishost
    if _thishost is None:
        try:
            _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
        except socket.gaierror:
            _thishost = tuple(socket.gethostbyname_ex('localhost')[2])
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is None:
        _noheaders = email.message_from_string("")
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # refcount tracks outstanding data-transfer file objects; keepalive
        # keeps the control connection open between transfers.
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except:
            self.close()
            raise

    def init(self):
        # (Re)establish the control connection and change to the target
        # directory; also called from retrfile() after a dropped connection.
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Control connection likely timed out; reconnect once and retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file" -- fall through to a
                # directory listing below; anything else is a real error.
                if str(reason)[:3] != '550':
                    raise URLError('ftp error: %r' % reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close() runs when the caller closes the returned object,
        # decrementing refcount (see below).
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        self.busy = 0

    def close(self):
        # Disable keepalive; the connection is torn down now only if no
        # transfer file objects are still outstanding.
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        # Invoked via addclosehook when a returned file object is closed.
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # in order to prefer lowercase variables, process environment in
    # two passes: first matches any, second pass matches lowercase only
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            name = name.lower()
            if value:
                proxies[name[:-6]] = value
            else:
                # An empty lowercase variable explicitly disables the proxy.
                proxies.pop(name[:-6], None)
    return proxies

def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return False
    # '*' is special case for always bypass
    if no_proxy == '*':
        return True
    host = host.lower()
    # strip port off host
    hostonly, port = _splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        name = name.strip()
        if name:
            name = name.lstrip('.')  # ignore leading dots
            name = name.lower()
            # Match both the bare host and host:port forms, exactly or as
            # a dotted-suffix.
            if hostonly == name or host == name:
                return True
            name = '.' + name
            if hostonly.endswith(name) or host.endswith(name):
                return True
    # otherwise, don't bypass
    return False


# This code tests an OSX specific data structure but is testable on all
# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = _splitport(host)

    def ip2num(ipAddr):
        # Pack a (possibly partial) dotted quad into a 32-bit integer,
        # zero-filling missing trailing octets.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    # Resolved lazily (and at most once) only if a numeric exception entry
    # is encountered.
    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except OSError:
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit prefix: infer 8 bits per dotted component.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            if mask < 0 or mask > 32:
                # System libraries ignore invalid prefix lengths
                continue

            # Compare only the network part of both addresses.
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False


if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('(?:[^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['https'] = 'https://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' bypasses for any host without a dot in its name.
                if '.' not in rawHost:
                    return 1
            # Translate the registry's glob pattern into a regex.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment