1"""An extensible library for opening URLs using a variety of protocols 2 3The simplest way to use this module is to call the urlopen function, 4which accepts a string containing a URL or a Request object (described 5below). It opens the URL and returns the results as file-like 6object; the returned object has some extra methods described below. 7 8The OpenerDirector manages a collection of Handler objects that do 9all the actual work. Each Handler implements a particular protocol or 10option. The OpenerDirector is a composite object that invokes the 11Handlers needed to open the requested URL. For example, the 12HTTPHandler performs HTTP GET and POST requests and deals with 13non-error returns. The HTTPRedirectHandler automatically deals with 14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 15deals with digest authentication. 16 17urlopen(url, data=None) -- Basic usage is the same as original 18urllib. pass the url and optionally data to post to an HTTP URL, and 19get a file-like object back. One difference is that you can also pass 20a Request instance instead of URL. Raises a URLError (subclass of 21OSError); for HTTP errors, raises an HTTPError, which can also be 22treated as a valid response. 23 24build_opener -- Function that creates a new OpenerDirector instance. 25Will install the default handlers. Accepts one or more Handlers as 26arguments, either instances or Handler classes that it will 27instantiate. If one of the argument is a subclass of the default 28handler, the argument will be installed instead of the default. 29 30install_opener -- Installs a new opener as the default opener. 31 32objects of interest: 33 34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 35the Handler classes, while dealing with requests and responses. 36 37Request -- An object that encapsulates the state of a request. The 38state can be as simple as the URL. It can also include extra HTTP 39headers, e.g. a User-Agent. 40 41BaseHandler -- 42 43internals: 44BaseHandler and parent 45_call_chain conventions 46 47Example usage: 48 49import urllib.request 50 51# set up authentication info 52authinfo = urllib.request.HTTPBasicAuthHandler() 53authinfo.add_password(realm='PDQ Application', 54 uri='https://mahler:8092/site-updates.py', 55 user='klem', 56 passwd='geheim$parole') 57 58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"}) 59 60# build a new opener that adds authentication and caching FTP handlers 61opener = urllib.request.build_opener(proxy_support, authinfo, 62 urllib.request.CacheFTPHandler) 63 64# install it 65urllib.request.install_opener(opener) 66 67f = urllib.request.urlopen('https://www.python.org/') 68""" 69 70# XXX issues: 71# If an authentication error handler that tries to perform 72# authentication for some reason but fails, how should the error be 73# signalled? The client needs to know the HTTP error code. But if 74# the handler knows that the problem was, e.g., that it didn't know 75# that hash algo that requested in the challenge, it would be good to 76# pass that information along to the client, too. 77# ftp errors aren't handled cleanly 78# check digest against correct (i.e. non-apache) implementation 79 80# Possible extensions: 81# complex proxies XXX not sure what exactly was meant by this 82# abstract factory for opener 83 84import base64 85import bisect 86import email 87import hashlib 88import http.client 89import io 90import os 91import posixpath 92import re 93import socket 94import string 95import sys 96import time 97import tempfile 98import contextlib 99import warnings 100 101 102from urllib.error import URLError, HTTPError, ContentTooShortError 103from urllib.parse import ( 104 urlparse, urlsplit, urljoin, unwrap, quote, unquote, 105 _splittype, _splithost, _splitport, _splituser, _splitpasswd, 106 _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes, 107 unquote_to_bytes, urlunparse) 108from urllib.response import addinfourl, addclosehook 109 110# check for SSL 111try: 112 import ssl 113except ImportError: 114 _have_ssl = False 115else: 116 _have_ssl = True 117 118__all__ = [ 119 # Classes 120 'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler', 121 'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler', 122 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 123 'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler', 124 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler', 125 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler', 126 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler', 127 'UnknownHandler', 'HTTPErrorProcessor', 128 # Functions 129 'urlopen', 'install_opener', 'build_opener', 130 'pathname2url', 'url2pathname', 'getproxies', 131 # Legacy interface 132 'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener', 133] 134 135# used in User-Agent header sent 136__version__ = '%d.%d' % sys.version_info[:2] 137 138_opener = None 139def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, 140 *, cafile=None, capath=None, cadefault=False, context=None): 141 '''Open the URL url, which can be either a string or a Request object. 142 143 *data* must be an object specifying additional data to be sent to 144 the server, or None if no such data is needed. See Request for 145 details. 146 147 urllib.request module uses HTTP/1.1 and includes a "Connection:close" 148 header in its HTTP requests. 149 150 The optional *timeout* parameter specifies a timeout in seconds for 151 blocking operations like the connection attempt (if not specified, the 152 global default timeout setting will be used). This only works for HTTP, 153 HTTPS and FTP connections. 154 155 If *context* is specified, it must be a ssl.SSLContext instance describing 156 the various SSL options. See HTTPSConnection for more details. 157 158 The optional *cafile* and *capath* parameters specify a set of trusted CA 159 certificates for HTTPS requests. cafile should point to a single file 160 containing a bundle of CA certificates, whereas capath should point to a 161 directory of hashed certificate files. More information can be found in 162 ssl.SSLContext.load_verify_locations(). 163 164 The *cadefault* parameter is ignored. 165 166 167 This function always returns an object which can work as a 168 context manager and has the properties url, headers, and status. 169 See urllib.response.addinfourl for more detail on these properties. 170 171 For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse 172 object slightly modified. In addition to the three new methods above, the 173 msg attribute contains the same information as the reason attribute --- 174 the reason phrase returned by the server --- instead of the response 175 headers as it is specified in the documentation for HTTPResponse. 176 177 For FTP, file, and data URLs and requests explicitly handled by legacy 178 URLopener and FancyURLopener classes, this function returns a 179 urllib.response.addinfourl object. 180 181 Note that None may be returned if no handler handles the request (though 182 the default installed global OpenerDirector uses UnknownHandler to ensure 183 this never happens). 184 185 In addition, if proxy settings are detected (for example, when a *_proxy 186 environment variable like http_proxy is set), ProxyHandler is default 187 installed and makes sure the requests are handled through the proxy. 188 189 ''' 190 global _opener 191 if cafile or capath or cadefault: 192 import warnings 193 warnings.warn("cafile, capath and cadefault are deprecated, use a " 194 "custom context instead.", DeprecationWarning, 2) 195 if context is not None: 196 raise ValueError( 197 "You can't pass both context and any of cafile, capath, and " 198 "cadefault" 199 ) 200 if not _have_ssl: 201 raise ValueError('SSL support not available') 202 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH, 203 cafile=cafile, 204 capath=capath) 205 # send ALPN extension to indicate HTTP/1.1 protocol 206 context.set_alpn_protocols(['http/1.1']) 207 https_handler = HTTPSHandler(context=context) 208 opener = build_opener(https_handler) 209 elif context: 210 https_handler = HTTPSHandler(context=context) 211 opener = build_opener(https_handler) 212 elif _opener is None: 213 _opener = opener = build_opener() 214 else: 215 opener = _opener 216 return opener.open(url, data, timeout) 217 218def install_opener(opener): 219 global _opener 220 _opener = opener 221 222_url_tempfiles = [] 223def urlretrieve(url, filename=None, reporthook=None, data=None): 224 """ 225 Retrieve a URL into a temporary location on disk. 226 227 Requires a URL argument. If a filename is passed, it is used as 228 the temporary file location. The reporthook argument should be 229 a callable that accepts a block number, a read size, and the 230 total file size of the URL target. The data argument should be 231 valid URL encoded data. 232 233 If a filename is passed and the URL points to a local resource, 234 the result is a copy from local file to new file. 235 236 Returns a tuple containing the path to the newly created 237 data file as well as the resulting HTTPMessage object. 238 """ 239 url_type, path = _splittype(url) 240 241 with contextlib.closing(urlopen(url, data)) as fp: 242 headers = fp.info() 243 244 # Just return the local path and the "headers" for file:// 245 # URLs. No sense in performing a copy unless requested. 246 if url_type == "file" and not filename: 247 return os.path.normpath(path), headers 248 249 # Handle temporary file setup. 250 if filename: 251 tfp = open(filename, 'wb') 252 else: 253 tfp = tempfile.NamedTemporaryFile(delete=False) 254 filename = tfp.name 255 _url_tempfiles.append(filename) 256 257 with tfp: 258 result = filename, headers 259 bs = 1024*8 260 size = -1 261 read = 0 262 blocknum = 0 263 if "content-length" in headers: 264 size = int(headers["Content-Length"]) 265 266 if reporthook: 267 reporthook(blocknum, bs, size) 268 269 while True: 270 block = fp.read(bs) 271 if not block: 272 break 273 read += len(block) 274 tfp.write(block) 275 blocknum += 1 276 if reporthook: 277 reporthook(blocknum, bs, size) 278 279 if size >= 0 and read < size: 280 raise ContentTooShortError( 281 "retrieval incomplete: got only %i out of %i bytes" 282 % (read, size), result) 283 284 return result 285 286def urlcleanup(): 287 """Clean up temporary files from urlretrieve calls.""" 288 for temp_file in _url_tempfiles: 289 try: 290 os.unlink(temp_file) 291 except OSError: 292 pass 293 294 del _url_tempfiles[:] 295 global _opener 296 if _opener: 297 _opener = None 298 299# copied from cookielib.py 300_cut_port_re = re.compile(r":\d+$", re.ASCII) 301def request_host(request): 302 """Return request-host, as defined by RFC 2965. 303 304 Variation from RFC: returned value is lowercased, for convenient 305 comparison. 306 307 """ 308 url = request.full_url 309 host = urlparse(url)[1] 310 if host == "": 311 host = request.get_header("Host", "") 312 313 # remove port, if present 314 host = _cut_port_re.sub("", host, 1) 315 return host.lower() 316 317class Request: 318 319 def __init__(self, url, data=None, headers={}, 320 origin_req_host=None, unverifiable=False, 321 method=None): 322 self.full_url = url 323 self.headers = {} 324 self.unredirected_hdrs = {} 325 self._data = None 326 self.data = data 327 self._tunnel_host = None 328 for key, value in headers.items(): 329 self.add_header(key, value) 330 if origin_req_host is None: 331 origin_req_host = request_host(self) 332 self.origin_req_host = origin_req_host 333 self.unverifiable = unverifiable 334 if method: 335 self.method = method 336 337 @property 338 def full_url(self): 339 if self.fragment: 340 return '{}#{}'.format(self._full_url, self.fragment) 341 return self._full_url 342 343 @full_url.setter 344 def full_url(self, url): 345 # unwrap('<URL:type://host/path>') --> 'type://host/path' 346 self._full_url = unwrap(url) 347 self._full_url, self.fragment = _splittag(self._full_url) 348 self._parse() 349 350 @full_url.deleter 351 def full_url(self): 352 self._full_url = None 353 self.fragment = None 354 self.selector = '' 355 356 @property 357 def data(self): 358 return self._data 359 360 @data.setter 361 def data(self, data): 362 if data != self._data: 363 self._data = data 364 # issue 16464 365 # if we change data we need to remove content-length header 366 # (cause it's most probably calculated for previous value) 367 if self.has_header("Content-length"): 368 self.remove_header("Content-length") 369 370 @data.deleter 371 def data(self): 372 self.data = None 373 374 def _parse(self): 375 self.type, rest = _splittype(self._full_url) 376 if self.type is None: 377 raise ValueError("unknown url type: %r" % self.full_url) 378 self.host, self.selector = _splithost(rest) 379 if self.host: 380 self.host = unquote(self.host) 381 382 def get_method(self): 383 """Return a string indicating the HTTP request method.""" 384 default_method = "POST" if self.data is not None else "GET" 385 return getattr(self, 'method', default_method) 386 387 def get_full_url(self): 388 return self.full_url 389 390 def set_proxy(self, host, type): 391 if self.type == 'https' and not self._tunnel_host: 392 self._tunnel_host = self.host 393 else: 394 self.type= type 395 self.selector = self.full_url 396 self.host = host 397 398 def has_proxy(self): 399 return self.selector == self.full_url 400 401 def add_header(self, key, val): 402 # useful for something like authentication 403 self.headers[key.capitalize()] = val 404 405 def add_unredirected_header(self, key, val): 406 # will not be added to a redirected request 407 self.unredirected_hdrs[key.capitalize()] = val 408 409 def has_header(self, header_name): 410 return (header_name in self.headers or 411 header_name in self.unredirected_hdrs) 412 413 def get_header(self, header_name, default=None): 414 return self.headers.get( 415 header_name, 416 self.unredirected_hdrs.get(header_name, default)) 417 418 def remove_header(self, header_name): 419 self.headers.pop(header_name, None) 420 self.unredirected_hdrs.pop(header_name, None) 421 422 def header_items(self): 423 hdrs = {**self.unredirected_hdrs, **self.headers} 424 return list(hdrs.items()) 425 426class OpenerDirector: 427 def __init__(self): 428 client_version = "Python-urllib/%s" % __version__ 429 self.addheaders = [('User-agent', client_version)] 430 # self.handlers is retained only for backward compatibility 431 self.handlers = [] 432 # manage the individual handlers 433 self.handle_open = {} 434 self.handle_error = {} 435 self.process_response = {} 436 self.process_request = {} 437 438 def add_handler(self, handler): 439 if not hasattr(handler, "add_parent"): 440 raise TypeError("expected BaseHandler instance, got %r" % 441 type(handler)) 442 443 added = False 444 for meth in dir(handler): 445 if meth in ["redirect_request", "do_open", "proxy_open"]: 446 # oops, coincidental match 447 continue 448 449 i = meth.find("_") 450 protocol = meth[:i] 451 condition = meth[i+1:] 452 453 if condition.startswith("error"): 454 j = condition.find("_") + i + 1 455 kind = meth[j+1:] 456 try: 457 kind = int(kind) 458 except ValueError: 459 pass 460 lookup = self.handle_error.get(protocol, {}) 461 self.handle_error[protocol] = lookup 462 elif condition == "open": 463 kind = protocol 464 lookup = self.handle_open 465 elif condition == "response": 466 kind = protocol 467 lookup = self.process_response 468 elif condition == "request": 469 kind = protocol 470 lookup = self.process_request 471 else: 472 continue 473 474 handlers = lookup.setdefault(kind, []) 475 if handlers: 476 bisect.insort(handlers, handler) 477 else: 478 handlers.append(handler) 479 added = True 480 481 if added: 482 bisect.insort(self.handlers, handler) 483 handler.add_parent(self) 484 485 def close(self): 486 # Only exists for backwards compatibility. 487 pass 488 489 def _call_chain(self, chain, kind, meth_name, *args): 490 # Handlers raise an exception if no one else should try to handle 491 # the request, or return None if they can't but another handler 492 # could. Otherwise, they return the response. 493 handlers = chain.get(kind, ()) 494 for handler in handlers: 495 func = getattr(handler, meth_name) 496 result = func(*args) 497 if result is not None: 498 return result 499 500 def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): 501 # accept a URL or a Request object 502 if isinstance(fullurl, str): 503 req = Request(fullurl, data) 504 else: 505 req = fullurl 506 if data is not None: 507 req.data = data 508 509 req.timeout = timeout 510 protocol = req.type 511 512 # pre-process request 513 meth_name = protocol+"_request" 514 for processor in self.process_request.get(protocol, []): 515 meth = getattr(processor, meth_name) 516 req = meth(req) 517 518 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method()) 519 response = self._open(req, data) 520 521 # post-process response 522 meth_name = protocol+"_response" 523 for processor in self.process_response.get(protocol, []): 524 meth = getattr(processor, meth_name) 525 response = meth(req, response) 526 527 return response 528 529 def _open(self, req, data=None): 530 result = self._call_chain(self.handle_open, 'default', 531 'default_open', req) 532 if result: 533 return result 534 535 protocol = req.type 536 result = self._call_chain(self.handle_open, protocol, protocol + 537 '_open', req) 538 if result: 539 return result 540 541 return self._call_chain(self.handle_open, 'unknown', 542 'unknown_open', req) 543 544 def error(self, proto, *args): 545 if proto in ('http', 'https'): 546 # XXX http[s] protocols are special-cased 547 dict = self.handle_error['http'] # https is not different than http 548 proto = args[2] # YUCK! 549 meth_name = 'http_error_%s' % proto 550 http_err = 1 551 orig_args = args 552 else: 553 dict = self.handle_error 554 meth_name = proto + '_error' 555 http_err = 0 556 args = (dict, proto, meth_name) + args 557 result = self._call_chain(*args) 558 if result: 559 return result 560 561 if http_err: 562 args = (dict, 'default', 'http_error_default') + orig_args 563 return self._call_chain(*args) 564 565# XXX probably also want an abstract factory that knows when it makes 566# sense to skip a superclass in favor of a subclass and when it might 567# make sense to include both 568 569def build_opener(*handlers): 570 """Create an opener object from a list of handlers. 571 572 The opener will use several default handlers, including support 573 for HTTP, FTP and when applicable HTTPS. 574 575 If any of the handlers passed as arguments are subclasses of the 576 default handlers, the default handlers will not be used. 577 """ 578 opener = OpenerDirector() 579 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, 580 HTTPDefaultErrorHandler, HTTPRedirectHandler, 581 FTPHandler, FileHandler, HTTPErrorProcessor, 582 DataHandler] 583 if hasattr(http.client, "HTTPSConnection"): 584 default_classes.append(HTTPSHandler) 585 skip = set() 586 for klass in default_classes: 587 for check in handlers: 588 if isinstance(check, type): 589 if issubclass(check, klass): 590 skip.add(klass) 591 elif isinstance(check, klass): 592 skip.add(klass) 593 for klass in skip: 594 default_classes.remove(klass) 595 596 for klass in default_classes: 597 opener.add_handler(klass()) 598 599 for h in handlers: 600 if isinstance(h, type): 601 h = h() 602 opener.add_handler(h) 603 return opener 604 605class BaseHandler: 606 handler_order = 500 607 608 def add_parent(self, parent): 609 self.parent = parent 610 611 def close(self): 612 # Only exists for backwards compatibility 613 pass 614 615 def __lt__(self, other): 616 if not hasattr(other, "handler_order"): 617 # Try to preserve the old behavior of having custom classes 618 # inserted after default ones (works only for custom user 619 # classes which are not aware of handler_order). 620 return True 621 return self.handler_order < other.handler_order 622 623 624class HTTPErrorProcessor(BaseHandler): 625 """Process HTTP error responses.""" 626 handler_order = 1000 # after all other processing 627 628 def http_response(self, request, response): 629 code, msg, hdrs = response.code, response.msg, response.info() 630 631 # According to RFC 2616, "2xx" code indicates that the client's 632 # request was successfully received, understood, and accepted. 633 if not (200 <= code < 300): 634 response = self.parent.error( 635 'http', request, response, code, msg, hdrs) 636 637 return response 638 639 https_response = http_response 640 641class HTTPDefaultErrorHandler(BaseHandler): 642 def http_error_default(self, req, fp, code, msg, hdrs): 643 raise HTTPError(req.full_url, code, msg, hdrs, fp) 644 645class HTTPRedirectHandler(BaseHandler): 646 # maximum number of redirections to any single URL 647 # this is needed because of the state that cookies introduce 648 max_repeats = 4 649 # maximum total number of redirections (regardless of URL) before 650 # assuming we're in a loop 651 max_redirections = 10 652 653 def redirect_request(self, req, fp, code, msg, headers, newurl): 654 """Return a Request or None in response to a redirect. 655 656 This is called by the http_error_30x methods when a 657 redirection response is received. If a redirection should 658 take place, return a new Request to allow http_error_30x to 659 perform the redirect. Otherwise, raise HTTPError if no-one 660 else should try to handle this url. Return None if you can't 661 but another Handler might. 662 """ 663 m = req.get_method() 664 if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD") 665 or code in (301, 302, 303) and m == "POST")): 666 raise HTTPError(req.full_url, code, msg, headers, fp) 667 668 # Strictly (according to RFC 2616), 301 or 302 in response to 669 # a POST MUST NOT cause a redirection without confirmation 670 # from the user (of urllib.request, in this case). In practice, 671 # essentially all clients do redirect in this case, so we do 672 # the same. 673 674 # Be conciliant with URIs containing a space. This is mainly 675 # redundant with the more complete encoding done in http_error_302(), 676 # but it is kept for compatibility with other callers. 677 newurl = newurl.replace(' ', '%20') 678 679 CONTENT_HEADERS = ("content-length", "content-type") 680 newheaders = {k: v for k, v in req.headers.items() 681 if k.lower() not in CONTENT_HEADERS} 682 return Request(newurl, 683 headers=newheaders, 684 origin_req_host=req.origin_req_host, 685 unverifiable=True) 686 687 # Implementation note: To avoid the server sending us into an 688 # infinite loop, the request object needs to track what URLs we 689 # have already seen. Do this by adding a handler-specific 690 # attribute to the Request object. 691 def http_error_302(self, req, fp, code, msg, headers): 692 # Some servers (incorrectly) return multiple Location headers 693 # (so probably same goes for URI). Use first header. 694 if "location" in headers: 695 newurl = headers["location"] 696 elif "uri" in headers: 697 newurl = headers["uri"] 698 else: 699 return 700 701 # fix a possible malformed URL 702 urlparts = urlparse(newurl) 703 704 # For security reasons we don't allow redirection to anything other 705 # than http, https or ftp. 706 707 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 708 raise HTTPError( 709 newurl, code, 710 "%s - Redirection to url '%s' is not allowed" % (msg, newurl), 711 headers, fp) 712 713 if not urlparts.path and urlparts.netloc: 714 urlparts = list(urlparts) 715 urlparts[2] = "/" 716 newurl = urlunparse(urlparts) 717 718 # http.client.parse_headers() decodes as ISO-8859-1. Recover the 719 # original bytes and percent-encode non-ASCII bytes, and any special 720 # characters such as the space. 721 newurl = quote( 722 newurl, encoding="iso-8859-1", safe=string.punctuation) 723 newurl = urljoin(req.full_url, newurl) 724 725 # XXX Probably want to forget about the state of the current 726 # request, although that might interact poorly with other 727 # handlers that also use handler-specific request attributes 728 new = self.redirect_request(req, fp, code, msg, headers, newurl) 729 if new is None: 730 return 731 732 # loop detection 733 # .redirect_dict has a key url if url was previously visited. 734 if hasattr(req, 'redirect_dict'): 735 visited = new.redirect_dict = req.redirect_dict 736 if (visited.get(newurl, 0) >= self.max_repeats or 737 len(visited) >= self.max_redirections): 738 raise HTTPError(req.full_url, code, 739 self.inf_msg + msg, headers, fp) 740 else: 741 visited = new.redirect_dict = req.redirect_dict = {} 742 visited[newurl] = visited.get(newurl, 0) + 1 743 744 # Don't close the fp until we are sure that we won't use it 745 # with HTTPError. 746 fp.read() 747 fp.close() 748 749 return self.parent.open(new, timeout=req.timeout) 750 751 http_error_301 = http_error_303 = http_error_307 = http_error_302 752 753 inf_msg = "The HTTP server returned a redirect error that would " \ 754 "lead to an infinite loop.\n" \ 755 "The last 30x error message was:\n" 756 757 758def _parse_proxy(proxy): 759 """Return (scheme, user, password, host/port) given a URL or an authority. 760 761 If a URL is supplied, it must have an authority (host:port) component. 762 According to RFC 3986, having an authority component means the URL must 763 have two slashes after the scheme. 764 """ 765 scheme, r_scheme = _splittype(proxy) 766 if not r_scheme.startswith("/"): 767 # authority 768 scheme = None 769 authority = proxy 770 else: 771 # URL 772 if not r_scheme.startswith("//"): 773 raise ValueError("proxy URL with no authority: %r" % proxy) 774 # We have an authority, so for RFC 3986-compliant URLs (by ss 3. 775 # and 3.3.), path is empty or starts with '/' 776 if '@' in r_scheme: 777 host_separator = r_scheme.find('@') 778 end = r_scheme.find("/", host_separator) 779 else: 780 end = r_scheme.find("/", 2) 781 if end == -1: 782 end = None 783 authority = r_scheme[2:end] 784 userinfo, hostport = _splituser(authority) 785 if userinfo is not None: 786 user, password = _splitpasswd(userinfo) 787 else: 788 user = password = None 789 return scheme, user, password, hostport 790 791class ProxyHandler(BaseHandler): 792 # Proxies must be in front 793 handler_order = 100 794 795 def __init__(self, proxies=None): 796 if proxies is None: 797 proxies = getproxies() 798 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 799 self.proxies = proxies 800 for type, url in proxies.items(): 801 type = type.lower() 802 setattr(self, '%s_open' % type, 803 lambda r, proxy=url, type=type, meth=self.proxy_open: 804 meth(r, proxy, type)) 805 806 def proxy_open(self, req, proxy, type): 807 orig_type = req.type 808 proxy_type, user, password, hostport = _parse_proxy(proxy) 809 if proxy_type is None: 810 proxy_type = orig_type 811 812 if req.host and proxy_bypass(req.host): 813 return None 814 815 if user and password: 816 user_pass = '%s:%s' % (unquote(user), 817 unquote(password)) 818 creds = base64.b64encode(user_pass.encode()).decode("ascii") 819 req.add_header('Proxy-authorization', 'Basic ' + creds) 820 hostport = unquote(hostport) 821 req.set_proxy(hostport, proxy_type) 822 if orig_type == proxy_type or orig_type == 'https': 823 # let other handlers take care of it 824 return None 825 else: 826 # need to start over, because the other handlers don't 827 # grok the proxy's URL type 828 # e.g. if we have a constructor arg proxies like so: 829 # {'http': 'ftp://proxy.example.com'}, we may end up turning 830 # a request for http://acme.example.com/a into one for 831 # ftp://proxy.example.com/a 832 return self.parent.open(req, timeout=req.timeout) 833 834class HTTPPasswordMgr: 835 836 def __init__(self): 837 self.passwd = {} 838 839 def add_password(self, realm, uri, user, passwd): 840 # uri could be a single URI or a sequence 841 if isinstance(uri, str): 842 uri = [uri] 843 if realm not in self.passwd: 844 self.passwd[realm] = {} 845 for default_port in True, False: 846 reduced_uri = tuple( 847 self.reduce_uri(u, default_port) for u in uri) 848 self.passwd[realm][reduced_uri] = (user, passwd) 849 850 def find_user_password(self, realm, authuri): 851 domains = self.passwd.get(realm, {}) 852 for default_port in True, False: 853 reduced_authuri = self.reduce_uri(authuri, default_port) 854 for uris, authinfo in domains.items(): 855 for uri in uris: 856 if self.is_suburi(uri, reduced_authuri): 857 return authinfo 858 return None, None 859 860 def reduce_uri(self, uri, default_port=True): 861 """Accept authority or URI and extract only the authority and path.""" 862 # note HTTP URLs do not have a userinfo component 863 parts = urlsplit(uri) 864 if parts[1]: 865 # URI 866 scheme = parts[0] 867 authority = parts[1] 868 path = parts[2] or '/' 869 else: 870 # host or host:port 871 scheme = None 872 authority = uri 873 path = '/' 874 host, port = _splitport(authority) 875 if default_port and port is None and scheme is not None: 876 dport = {"http": 80, 877 "https": 443, 878 }.get(scheme) 879 if dport is not None: 880 authority = "%s:%d" % (host, dport) 881 return authority, path 882 883 def is_suburi(self, base, test): 884 """Check if test is below base in a URI tree 885 886 Both args must be URIs in reduced form. 887 """ 888 if base == test: 889 return True 890 if base[0] != test[0]: 891 return False 892 common = posixpath.commonprefix((base[1], test[1])) 893 if len(common) == len(base[1]): 894 return True 895 return False 896 897 898class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): 899 900 def find_user_password(self, realm, authuri): 901 user, password = HTTPPasswordMgr.find_user_password(self, realm, 902 authuri) 903 if user is not None: 904 return user, password 905 return HTTPPasswordMgr.find_user_password(self, None, authuri) 906 907 908class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm): 909 910 def __init__(self, *args, **kwargs): 911 self.authenticated = {} 912 super().__init__(*args, **kwargs) 913 914 def add_password(self, realm, uri, user, passwd, is_authenticated=False): 915 self.update_authenticated(uri, is_authenticated) 916 # Add a default for prior auth requests 917 if realm is not None: 918 super().add_password(None, uri, user, passwd) 919 super().add_password(realm, uri, user, passwd) 920 921 def update_authenticated(self, uri, is_authenticated=False): 922 # uri could be a single URI or a sequence 923 if isinstance(uri, str): 924 uri = [uri] 925 926 for default_port in True, False: 927 for u in uri: 928 reduced_uri = self.reduce_uri(u, default_port) 929 self.authenticated[reduced_uri] = is_authenticated 930 931 def is_authenticated(self, authuri): 932 for default_port in True, False: 933 reduced_authuri = self.reduce_uri(authuri, default_port) 934 for uri in self.authenticated: 935 if self.is_suburi(uri, reduced_authuri): 936 return self.authenticated[uri] 937 938 939class AbstractBasicAuthHandler: 940 941 # XXX this allows for multiple auth-schemes, but will stupidly pick 942 # the last one with a realm specified. 943 944 # allow for double- and single-quoted realm values 945 # (single quotes are a violation of the RFC, but appear in the wild) 946 rx = re.compile('(?:^|,)' # start of the string or ',' 947 '[ \t]*' # optional whitespaces 948 '([^ \t,]+)' # scheme like "Basic" 949 '[ \t]+' # mandatory whitespaces 950 # realm=xxx 951 # realm='xxx' 952 # realm="xxx" 953 'realm=(["\']?)([^"\']*)\\2', 954 re.I) 955 956 # XXX could pre-emptively send auth info already accepted (RFC 2617, 957 # end of section 2, and section 1.2 immediately after "credentials" 958 # production). 959 960 def __init__(self, password_mgr=None): 961 if password_mgr is None: 962 password_mgr = HTTPPasswordMgr() 963 self.passwd = password_mgr 964 self.add_password = self.passwd.add_password 965 966 def _parse_realm(self, header): 967 # parse WWW-Authenticate header: accept multiple challenges per header 968 found_challenge = False 969 for mo in AbstractBasicAuthHandler.rx.finditer(header): 970 scheme, quote, realm = mo.groups() 971 if quote not in ['"', "'"]: 972 warnings.warn("Basic Auth Realm was unquoted", 973 UserWarning, 3) 974 975 yield (scheme, realm) 976 977 found_challenge = True 978 979 if not found_challenge: 980 if header: 981 scheme = header.split()[0] 982 else: 983 scheme = '' 984 yield (scheme, None) 985 986 def http_error_auth_reqed(self, authreq, host, req, headers): 987 # host may be an authority (without userinfo) or a URL with an 988 # authority 989 headers = headers.get_all(authreq) 990 if not headers: 991 # no header found 992 return 993 994 unsupported = None 995 for header in headers: 996 for scheme, realm in self._parse_realm(header): 997 if scheme.lower() != 'basic': 998 unsupported = scheme 999 continue 1000 1001 if realm is not None: 1002 # Use the first matching Basic challenge. 1003 # Ignore following challenges even if they use the Basic 1004 # scheme. 1005 return self.retry_http_basic_auth(host, req, realm) 1006 1007 if unsupported is not None: 1008 raise ValueError("AbstractBasicAuthHandler does not " 1009 "support the following scheme: %r" 1010 % (scheme,)) 1011 1012 def retry_http_basic_auth(self, host, req, realm): 1013 user, pw = self.passwd.find_user_password(realm, host) 1014 if pw is not None: 1015 raw = "%s:%s" % (user, pw) 1016 auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii") 1017 if req.get_header(self.auth_header, None) == auth: 1018 return None 1019 req.add_unredirected_header(self.auth_header, auth) 1020 return self.parent.open(req, timeout=req.timeout) 1021 else: 1022 return None 1023 1024 def http_request(self, req): 1025 if (not hasattr(self.passwd, 'is_authenticated') or 1026 not self.passwd.is_authenticated(req.full_url)): 1027 return req 1028 1029 if not req.has_header('Authorization'): 1030 user, passwd = self.passwd.find_user_password(None, req.full_url) 1031 credentials = '{0}:{1}'.format(user, passwd).encode() 1032 auth_str = base64.standard_b64encode(credentials).decode() 1033 req.add_unredirected_header('Authorization', 1034 'Basic {}'.format(auth_str.strip())) 1035 return req 1036 1037 def http_response(self, req, response): 1038 if hasattr(self.passwd, 'is_authenticated'): 1039 if 200 <= response.code < 300: 1040 self.passwd.update_authenticated(req.full_url, True) 1041 else: 1042 self.passwd.update_authenticated(req.full_url, False) 1043 return response 1044 1045 https_request = http_request 1046 https_response = http_response 1047 1048 1049 1050class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 1051 1052 auth_header = 'Authorization' 1053 1054 def http_error_401(self, req, fp, code, msg, headers): 1055 url = req.full_url 1056 response = self.http_error_auth_reqed('www-authenticate', 1057 url, req, headers) 1058 return response 1059 1060 1061class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 1062 1063 auth_header = 'Proxy-authorization' 1064 1065 def http_error_407(self, req, fp, code, msg, headers): 1066 # http_error_auth_reqed requires that there is no userinfo component in 1067 # authority. Assume there isn't one, since urllib.request does not (and 1068 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing 1069 # userinfo. 1070 authority = req.host 1071 response = self.http_error_auth_reqed('proxy-authenticate', 1072 authority, req, headers) 1073 return response 1074 1075 1076# Return n random bytes. 1077_randombytes = os.urandom 1078 1079 1080class AbstractDigestAuthHandler: 1081 # Digest authentication is specified in RFC 2617. 1082 1083 # XXX The client does not inspect the Authentication-Info header 1084 # in a successful response. 1085 1086 # XXX It should be possible to test this implementation against 1087 # a mock server that just generates a static set of challenges. 1088 1089 # XXX qop="auth-int" supports is shaky 1090 1091 def __init__(self, passwd=None): 1092 if passwd is None: 1093 passwd = HTTPPasswordMgr() 1094 self.passwd = passwd 1095 self.add_password = self.passwd.add_password 1096 self.retried = 0 1097 self.nonce_count = 0 1098 self.last_nonce = None 1099 1100 def reset_retry_count(self): 1101 self.retried = 0 1102 1103 def http_error_auth_reqed(self, auth_header, host, req, headers): 1104 authreq = headers.get(auth_header, None) 1105 if self.retried > 5: 1106 # Don't fail endlessly - if we failed once, we'll probably 1107 # fail a second time. Hm. Unless the Password Manager is 1108 # prompting for the information. Crap. This isn't great 1109 # but it's better than the current 'repeat until recursion 1110 # depth exceeded' approach <wink> 1111 raise HTTPError(req.full_url, 401, "digest auth failed", 1112 headers, None) 1113 else: 1114 self.retried += 1 1115 if authreq: 1116 scheme = authreq.split()[0] 1117 if scheme.lower() == 'digest': 1118 return self.retry_http_digest_auth(req, authreq) 1119 elif scheme.lower() != 'basic': 1120 raise ValueError("AbstractDigestAuthHandler does not support" 1121 " the following scheme: '%s'" % scheme) 1122 1123 def retry_http_digest_auth(self, req, auth): 1124 token, challenge = auth.split(' ', 1) 1125 chal = parse_keqv_list(filter(None, parse_http_list(challenge))) 1126 auth = self.get_authorization(req, chal) 1127 if auth: 1128 auth_val = 'Digest %s' % auth 1129 if req.headers.get(self.auth_header, None) == auth_val: 1130 return None 1131 req.add_unredirected_header(self.auth_header, auth_val) 1132 resp = self.parent.open(req, timeout=req.timeout) 1133 return resp 1134 1135 def get_cnonce(self, nonce): 1136 # The cnonce-value is an opaque 1137 # quoted string value provided by the client and used by both client 1138 # and server to avoid chosen plaintext attacks, to provide mutual 1139 # authentication, and to provide some message integrity protection. 1140 # This isn't a fabulous effort, but it's probably Good Enough. 1141 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) 1142 b = s.encode("ascii") + _randombytes(8) 1143 dig = hashlib.sha1(b).hexdigest() 1144 return dig[:16] 1145 1146 def get_authorization(self, req, chal): 1147 try: 1148 realm = chal['realm'] 1149 nonce = chal['nonce'] 1150 qop = chal.get('qop') 1151 algorithm = chal.get('algorithm', 'MD5') 1152 # mod_digest doesn't send an opaque, even though it isn't 1153 # supposed to be optional 1154 opaque = chal.get('opaque', None) 1155 except KeyError: 1156 return None 1157 1158 H, KD = self.get_algorithm_impls(algorithm) 1159 if H is None: 1160 return None 1161 1162 user, pw = self.passwd.find_user_password(realm, req.full_url) 1163 if user is None: 1164 return None 1165 1166 # XXX not implemented yet 1167 if req.data is not None: 1168 entdig = self.get_entity_digest(req.data, chal) 1169 else: 1170 entdig = None 1171 1172 A1 = "%s:%s:%s" % (user, realm, pw) 1173 A2 = "%s:%s" % (req.get_method(), 1174 # XXX selector: what about proxies and full urls 1175 req.selector) 1176 # NOTE: As per RFC 2617, when server sends "auth,auth-int", the client could use either `auth` 1177 # or `auth-int` to the response back. we use `auth` to send the response back. 1178 if qop is None: 1179 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 1180 elif 'auth' in qop.split(','): 1181 if nonce == self.last_nonce: 1182 self.nonce_count += 1 1183 else: 1184 self.nonce_count = 1 1185 self.last_nonce = nonce 1186 ncvalue = '%08x' % self.nonce_count 1187 cnonce = self.get_cnonce(nonce) 1188 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2)) 1189 respdig = KD(H(A1), noncebit) 1190 else: 1191 # XXX handle auth-int. 1192 raise URLError("qop '%s' is not supported." % qop) 1193 1194 # XXX should the partial digests be encoded too? 1195 1196 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 1197 'response="%s"' % (user, realm, nonce, req.selector, 1198 respdig) 1199 if opaque: 1200 base += ', opaque="%s"' % opaque 1201 if entdig: 1202 base += ', digest="%s"' % entdig 1203 base += ', algorithm="%s"' % algorithm 1204 if qop: 1205 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 1206 return base 1207 1208 def get_algorithm_impls(self, algorithm): 1209 # lambdas assume digest modules are imported at the top level 1210 if algorithm == 'MD5': 1211 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() 1212 elif algorithm == 'SHA': 1213 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() 1214 # XXX MD5-sess 1215 else: 1216 raise ValueError("Unsupported digest authentication " 1217 "algorithm %r" % algorithm) 1218 KD = lambda s, d: H("%s:%s" % (s, d)) 1219 return H, KD 1220 1221 def get_entity_digest(self, data, chal): 1222 # XXX not implemented yet 1223 return None 1224 1225 1226class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1227 """An authentication protocol defined by RFC 2069 1228 1229 Digest authentication improves on basic authentication because it 1230 does not transmit passwords in the clear. 1231 """ 1232 1233 auth_header = 'Authorization' 1234 handler_order = 490 # before Basic auth 1235 1236 def http_error_401(self, req, fp, code, msg, headers): 1237 host = urlparse(req.full_url)[1] 1238 retry = self.http_error_auth_reqed('www-authenticate', 1239 host, req, headers) 1240 self.reset_retry_count() 1241 return retry 1242 1243 1244class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1245 1246 auth_header = 'Proxy-Authorization' 1247 handler_order = 490 # before Basic auth 1248 1249 def http_error_407(self, req, fp, code, msg, headers): 1250 host = req.host 1251 retry = self.http_error_auth_reqed('proxy-authenticate', 1252 host, req, headers) 1253 self.reset_retry_count() 1254 return retry 1255 1256class AbstractHTTPHandler(BaseHandler): 1257 1258 def __init__(self, debuglevel=0): 1259 self._debuglevel = debuglevel 1260 1261 def set_http_debuglevel(self, level): 1262 self._debuglevel = level 1263 1264 def _get_content_length(self, request): 1265 return http.client.HTTPConnection._get_content_length( 1266 request.data, 1267 request.get_method()) 1268 1269 def do_request_(self, request): 1270 host = request.host 1271 if not host: 1272 raise URLError('no host given') 1273 1274 if request.data is not None: # POST 1275 data = request.data 1276 if isinstance(data, str): 1277 msg = "POST data should be bytes, an iterable of bytes, " \ 1278 "or a file object. It cannot be of type str." 1279 raise TypeError(msg) 1280 if not request.has_header('Content-type'): 1281 request.add_unredirected_header( 1282 'Content-type', 1283 'application/x-www-form-urlencoded') 1284 if (not request.has_header('Content-length') 1285 and not request.has_header('Transfer-encoding')): 1286 content_length = self._get_content_length(request) 1287 if content_length is not None: 1288 request.add_unredirected_header( 1289 'Content-length', str(content_length)) 1290 else: 1291 request.add_unredirected_header( 1292 'Transfer-encoding', 'chunked') 1293 1294 sel_host = host 1295 if request.has_proxy(): 1296 scheme, sel = _splittype(request.selector) 1297 sel_host, sel_path = _splithost(sel) 1298 if not request.has_header('Host'): 1299 request.add_unredirected_header('Host', sel_host) 1300 for name, value in self.parent.addheaders: 1301 name = name.capitalize() 1302 if not request.has_header(name): 1303 request.add_unredirected_header(name, value) 1304 1305 return request 1306 1307 def do_open(self, http_class, req, **http_conn_args): 1308 """Return an HTTPResponse object for the request, using http_class. 1309 1310 http_class must implement the HTTPConnection API from http.client. 1311 """ 1312 host = req.host 1313 if not host: 1314 raise URLError('no host given') 1315 1316 # will parse host:port 1317 h = http_class(host, timeout=req.timeout, **http_conn_args) 1318 h.set_debuglevel(self._debuglevel) 1319 1320 headers = dict(req.unredirected_hdrs) 1321 headers.update({k: v for k, v in req.headers.items() 1322 if k not in headers}) 1323 1324 # TODO(jhylton): Should this be redesigned to handle 1325 # persistent connections? 1326 1327 # We want to make an HTTP/1.1 request, but the addinfourl 1328 # class isn't prepared to deal with a persistent connection. 1329 # It will try to read all remaining data from the socket, 1330 # which will block while the server waits for the next request. 1331 # So make sure the connection gets closed after the (only) 1332 # request. 1333 headers["Connection"] = "close" 1334 headers = {name.title(): val for name, val in headers.items()} 1335 1336 if req._tunnel_host: 1337 tunnel_headers = {} 1338 proxy_auth_hdr = "Proxy-Authorization" 1339 if proxy_auth_hdr in headers: 1340 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] 1341 # Proxy-Authorization should not be sent to origin 1342 # server. 1343 del headers[proxy_auth_hdr] 1344 h.set_tunnel(req._tunnel_host, headers=tunnel_headers) 1345 1346 try: 1347 try: 1348 h.request(req.get_method(), req.selector, req.data, headers, 1349 encode_chunked=req.has_header('Transfer-encoding')) 1350 except OSError as err: # timeout error 1351 raise URLError(err) 1352 r = h.getresponse() 1353 except: 1354 h.close() 1355 raise 1356 1357 # If the server does not send us a 'Connection: close' header, 1358 # HTTPConnection assumes the socket should be left open. Manually 1359 # mark the socket to be closed when this response object goes away. 1360 if h.sock: 1361 h.sock.close() 1362 h.sock = None 1363 1364 r.url = req.get_full_url() 1365 # This line replaces the .msg attribute of the HTTPResponse 1366 # with .headers, because urllib clients expect the response to 1367 # have the reason in .msg. It would be good to mark this 1368 # attribute is deprecated and get then to use info() or 1369 # .headers. 1370 r.msg = r.reason 1371 return r 1372 1373 1374class HTTPHandler(AbstractHTTPHandler): 1375 1376 def http_open(self, req): 1377 return self.do_open(http.client.HTTPConnection, req) 1378 1379 http_request = AbstractHTTPHandler.do_request_ 1380 1381if hasattr(http.client, 'HTTPSConnection'): 1382 1383 class HTTPSHandler(AbstractHTTPHandler): 1384 1385 def __init__(self, debuglevel=0, context=None, check_hostname=None): 1386 AbstractHTTPHandler.__init__(self, debuglevel) 1387 self._context = context 1388 self._check_hostname = check_hostname 1389 1390 def https_open(self, req): 1391 return self.do_open(http.client.HTTPSConnection, req, 1392 context=self._context, check_hostname=self._check_hostname) 1393 1394 https_request = AbstractHTTPHandler.do_request_ 1395 1396 __all__.append('HTTPSHandler') 1397 1398class HTTPCookieProcessor(BaseHandler): 1399 def __init__(self, cookiejar=None): 1400 import http.cookiejar 1401 if cookiejar is None: 1402 cookiejar = http.cookiejar.CookieJar() 1403 self.cookiejar = cookiejar 1404 1405 def http_request(self, request): 1406 self.cookiejar.add_cookie_header(request) 1407 return request 1408 1409 def http_response(self, request, response): 1410 self.cookiejar.extract_cookies(response, request) 1411 return response 1412 1413 https_request = http_request 1414 https_response = http_response 1415 1416class UnknownHandler(BaseHandler): 1417 def unknown_open(self, req): 1418 type = req.type 1419 raise URLError('unknown url type: %s' % type) 1420 1421def parse_keqv_list(l): 1422 """Parse list of key=value strings where keys are not duplicated.""" 1423 parsed = {} 1424 for elt in l: 1425 k, v = elt.split('=', 1) 1426 if v[0] == '"' and v[-1] == '"': 1427 v = v[1:-1] 1428 parsed[k] = v 1429 return parsed 1430 1431def parse_http_list(s): 1432 """Parse lists as described by RFC 2068 Section 2. 1433 1434 In particular, parse comma-separated lists where the elements of 1435 the list may include quoted-strings. A quoted-string could 1436 contain a comma. A non-quoted string could have quotes in the 1437 middle. Neither commas nor quotes count if they are escaped. 1438 Only double-quotes count, not single-quotes. 1439 """ 1440 res = [] 1441 part = '' 1442 1443 escape = quote = False 1444 for cur in s: 1445 if escape: 1446 part += cur 1447 escape = False 1448 continue 1449 if quote: 1450 if cur == '\\': 1451 escape = True 1452 continue 1453 elif cur == '"': 1454 quote = False 1455 part += cur 1456 continue 1457 1458 if cur == ',': 1459 res.append(part) 1460 part = '' 1461 continue 1462 1463 if cur == '"': 1464 quote = True 1465 1466 part += cur 1467 1468 # append last part 1469 if part: 1470 res.append(part) 1471 1472 return [part.strip() for part in res] 1473 1474class FileHandler(BaseHandler): 1475 # Use local file or FTP depending on form of URL 1476 def file_open(self, req): 1477 url = req.selector 1478 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1479 req.host != 'localhost'): 1480 if not req.host in self.get_names(): 1481 raise URLError("file:// scheme is supported only on localhost") 1482 else: 1483 return self.open_local_file(req) 1484 1485 # names for the localhost 1486 names = None 1487 def get_names(self): 1488 if FileHandler.names is None: 1489 try: 1490 FileHandler.names = tuple( 1491 socket.gethostbyname_ex('localhost')[2] + 1492 socket.gethostbyname_ex(socket.gethostname())[2]) 1493 except socket.gaierror: 1494 FileHandler.names = (socket.gethostbyname('localhost'),) 1495 return FileHandler.names 1496 1497 # not entirely sure what the rules are here 1498 def open_local_file(self, req): 1499 import email.utils 1500 import mimetypes 1501 host = req.host 1502 filename = req.selector 1503 localfile = url2pathname(filename) 1504 try: 1505 stats = os.stat(localfile) 1506 size = stats.st_size 1507 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1508 mtype = mimetypes.guess_type(filename)[0] 1509 headers = email.message_from_string( 1510 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1511 (mtype or 'text/plain', size, modified)) 1512 if host: 1513 host, port = _splitport(host) 1514 if not host or \ 1515 (not port and _safe_gethostbyname(host) in self.get_names()): 1516 if host: 1517 origurl = 'file://' + host + filename 1518 else: 1519 origurl = 'file://' + filename 1520 return addinfourl(open(localfile, 'rb'), headers, origurl) 1521 except OSError as exp: 1522 raise URLError(exp) 1523 raise URLError('file not on local host') 1524 1525def _safe_gethostbyname(host): 1526 try: 1527 return socket.gethostbyname(host) 1528 except socket.gaierror: 1529 return None 1530 1531class FTPHandler(BaseHandler): 1532 def ftp_open(self, req): 1533 import ftplib 1534 import mimetypes 1535 host = req.host 1536 if not host: 1537 raise URLError('ftp error: no host given') 1538 host, port = _splitport(host) 1539 if port is None: 1540 port = ftplib.FTP_PORT 1541 else: 1542 port = int(port) 1543 1544 # username/password handling 1545 user, host = _splituser(host) 1546 if user: 1547 user, passwd = _splitpasswd(user) 1548 else: 1549 passwd = None 1550 host = unquote(host) 1551 user = user or '' 1552 passwd = passwd or '' 1553 1554 try: 1555 host = socket.gethostbyname(host) 1556 except OSError as msg: 1557 raise URLError(msg) 1558 path, attrs = _splitattr(req.selector) 1559 dirs = path.split('/') 1560 dirs = list(map(unquote, dirs)) 1561 dirs, file = dirs[:-1], dirs[-1] 1562 if dirs and not dirs[0]: 1563 dirs = dirs[1:] 1564 try: 1565 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) 1566 type = file and 'I' or 'D' 1567 for attr in attrs: 1568 attr, value = _splitvalue(attr) 1569 if attr.lower() == 'type' and \ 1570 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1571 type = value.upper() 1572 fp, retrlen = fw.retrfile(file, type) 1573 headers = "" 1574 mtype = mimetypes.guess_type(req.full_url)[0] 1575 if mtype: 1576 headers += "Content-type: %s\n" % mtype 1577 if retrlen is not None and retrlen >= 0: 1578 headers += "Content-length: %d\n" % retrlen 1579 headers = email.message_from_string(headers) 1580 return addinfourl(fp, headers, req.full_url) 1581 except ftplib.all_errors as exp: 1582 exc = URLError('ftp error: %r' % exp) 1583 raise exc.with_traceback(sys.exc_info()[2]) 1584 1585 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1586 return ftpwrapper(user, passwd, host, port, dirs, timeout, 1587 persistent=False) 1588 1589class CacheFTPHandler(FTPHandler): 1590 # XXX would be nice to have pluggable cache strategies 1591 # XXX this stuff is definitely not thread safe 1592 def __init__(self): 1593 self.cache = {} 1594 self.timeout = {} 1595 self.soonest = 0 1596 self.delay = 60 1597 self.max_conns = 16 1598 1599 def setTimeout(self, t): 1600 self.delay = t 1601 1602 def setMaxConns(self, m): 1603 self.max_conns = m 1604 1605 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1606 key = user, host, port, '/'.join(dirs), timeout 1607 if key in self.cache: 1608 self.timeout[key] = time.time() + self.delay 1609 else: 1610 self.cache[key] = ftpwrapper(user, passwd, host, port, 1611 dirs, timeout) 1612 self.timeout[key] = time.time() + self.delay 1613 self.check_cache() 1614 return self.cache[key] 1615 1616 def check_cache(self): 1617 # first check for old ones 1618 t = time.time() 1619 if self.soonest <= t: 1620 for k, v in list(self.timeout.items()): 1621 if v < t: 1622 self.cache[k].close() 1623 del self.cache[k] 1624 del self.timeout[k] 1625 self.soonest = min(list(self.timeout.values())) 1626 1627 # then check the size 1628 if len(self.cache) == self.max_conns: 1629 for k, v in list(self.timeout.items()): 1630 if v == self.soonest: 1631 del self.cache[k] 1632 del self.timeout[k] 1633 break 1634 self.soonest = min(list(self.timeout.values())) 1635 1636 def clear_cache(self): 1637 for conn in self.cache.values(): 1638 conn.close() 1639 self.cache.clear() 1640 self.timeout.clear() 1641 1642class DataHandler(BaseHandler): 1643 def data_open(self, req): 1644 # data URLs as specified in RFC 2397. 1645 # 1646 # ignores POSTed data 1647 # 1648 # syntax: 1649 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 1650 # mediatype := [ type "/" subtype ] *( ";" parameter ) 1651 # data := *urlchar 1652 # parameter := attribute "=" value 1653 url = req.full_url 1654 1655 scheme, data = url.split(":",1) 1656 mediatype, data = data.split(",",1) 1657 1658 # even base64 encoded data URLs might be quoted so unquote in any case: 1659 data = unquote_to_bytes(data) 1660 if mediatype.endswith(";base64"): 1661 data = base64.decodebytes(data) 1662 mediatype = mediatype[:-7] 1663 1664 if not mediatype: 1665 mediatype = "text/plain;charset=US-ASCII" 1666 1667 headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" % 1668 (mediatype, len(data))) 1669 1670 return addinfourl(io.BytesIO(data), headers, url) 1671 1672 1673# Code move from the old urllib module 1674 1675MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 1676 1677# Helper for non-unix systems 1678if os.name == 'nt': 1679 from nturl2path import url2pathname, pathname2url 1680else: 1681 def url2pathname(pathname): 1682 """OS-specific conversion from a relative URL of the 'file' scheme 1683 to a file system path; not recommended for general use.""" 1684 return unquote(pathname) 1685 1686 def pathname2url(pathname): 1687 """OS-specific conversion from a file system path to a relative URL 1688 of the 'file' scheme; not recommended for general use.""" 1689 return quote(pathname) 1690 1691 1692ftpcache = {} 1693 1694 1695class URLopener: 1696 """Class to open URLs. 1697 This is a class rather than just a subroutine because we may need 1698 more than one set of global protocol-specific options. 1699 Note -- this is a base class for those who don't want the 1700 automatic handling of errors type 302 (relocated) and 401 1701 (authorization needed).""" 1702 1703 __tempfiles = None 1704 1705 version = "Python-urllib/%s" % __version__ 1706 1707 # Constructor 1708 def __init__(self, proxies=None, **x509): 1709 msg = "%(class)s style of invoking requests is deprecated. " \ 1710 "Use newer urlopen functions/methods" % {'class': self.__class__.__name__} 1711 warnings.warn(msg, DeprecationWarning, stacklevel=3) 1712 if proxies is None: 1713 proxies = getproxies() 1714 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 1715 self.proxies = proxies 1716 self.key_file = x509.get('key_file') 1717 self.cert_file = x509.get('cert_file') 1718 self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')] 1719 self.__tempfiles = [] 1720 self.__unlink = os.unlink # See cleanup() 1721 self.tempcache = None 1722 # Undocumented feature: if you assign {} to tempcache, 1723 # it is used to cache files retrieved with 1724 # self.retrieve(). This is not enabled by default 1725 # since it does not work for changing documents (and I 1726 # haven't got the logic to check expiration headers 1727 # yet). 1728 self.ftpcache = ftpcache 1729 # Undocumented feature: you can use a different 1730 # ftp cache by assigning to the .ftpcache member; 1731 # in case you want logically independent URL openers 1732 # XXX This is not threadsafe. Bah. 1733 1734 def __del__(self): 1735 self.close() 1736 1737 def close(self): 1738 self.cleanup() 1739 1740 def cleanup(self): 1741 # This code sometimes runs when the rest of this module 1742 # has already been deleted, so it can't use any globals 1743 # or import anything. 1744 if self.__tempfiles: 1745 for file in self.__tempfiles: 1746 try: 1747 self.__unlink(file) 1748 except OSError: 1749 pass 1750 del self.__tempfiles[:] 1751 if self.tempcache: 1752 self.tempcache.clear() 1753 1754 def addheader(self, *args): 1755 """Add a header to be used by the HTTP interface only 1756 e.g. u.addheader('Accept', 'sound/basic')""" 1757 self.addheaders.append(args) 1758 1759 # External interface 1760 def open(self, fullurl, data=None): 1761 """Use URLopener().open(file) instead of open(file, 'r').""" 1762 fullurl = unwrap(_to_bytes(fullurl)) 1763 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 1764 if self.tempcache and fullurl in self.tempcache: 1765 filename, headers = self.tempcache[fullurl] 1766 fp = open(filename, 'rb') 1767 return addinfourl(fp, headers, fullurl) 1768 urltype, url = _splittype(fullurl) 1769 if not urltype: 1770 urltype = 'file' 1771 if urltype in self.proxies: 1772 proxy = self.proxies[urltype] 1773 urltype, proxyhost = _splittype(proxy) 1774 host, selector = _splithost(proxyhost) 1775 url = (host, fullurl) # Signal special case to open_*() 1776 else: 1777 proxy = None 1778 name = 'open_' + urltype 1779 self.type = urltype 1780 name = name.replace('-', '_') 1781 if not hasattr(self, name) or name == 'open_local_file': 1782 if proxy: 1783 return self.open_unknown_proxy(proxy, fullurl, data) 1784 else: 1785 return self.open_unknown(fullurl, data) 1786 try: 1787 if data is None: 1788 return getattr(self, name)(url) 1789 else: 1790 return getattr(self, name)(url, data) 1791 except (HTTPError, URLError): 1792 raise 1793 except OSError as msg: 1794 raise OSError('socket error', msg).with_traceback(sys.exc_info()[2]) 1795 1796 def open_unknown(self, fullurl, data=None): 1797 """Overridable interface to open unknown URL type.""" 1798 type, url = _splittype(fullurl) 1799 raise OSError('url error', 'unknown url type', type) 1800 1801 def open_unknown_proxy(self, proxy, fullurl, data=None): 1802 """Overridable interface to open unknown URL type.""" 1803 type, url = _splittype(fullurl) 1804 raise OSError('url error', 'invalid proxy for %s' % type, proxy) 1805 1806 # External interface 1807 def retrieve(self, url, filename=None, reporthook=None, data=None): 1808 """retrieve(url) returns (filename, headers) for a local object 1809 or (tempfilename, headers) for a remote object.""" 1810 url = unwrap(_to_bytes(url)) 1811 if self.tempcache and url in self.tempcache: 1812 return self.tempcache[url] 1813 type, url1 = _splittype(url) 1814 if filename is None and (not type or type == 'file'): 1815 try: 1816 fp = self.open_local_file(url1) 1817 hdrs = fp.info() 1818 fp.close() 1819 return url2pathname(_splithost(url1)[1]), hdrs 1820 except OSError: 1821 pass 1822 fp = self.open(url, data) 1823 try: 1824 headers = fp.info() 1825 if filename: 1826 tfp = open(filename, 'wb') 1827 else: 1828 garbage, path = _splittype(url) 1829 garbage, path = _splithost(path or "") 1830 path, garbage = _splitquery(path or "") 1831 path, garbage = _splitattr(path or "") 1832 suffix = os.path.splitext(path)[1] 1833 (fd, filename) = tempfile.mkstemp(suffix) 1834 self.__tempfiles.append(filename) 1835 tfp = os.fdopen(fd, 'wb') 1836 try: 1837 result = filename, headers 1838 if self.tempcache is not None: 1839 self.tempcache[url] = result 1840 bs = 1024*8 1841 size = -1 1842 read = 0 1843 blocknum = 0 1844 if "content-length" in headers: 1845 size = int(headers["Content-Length"]) 1846 if reporthook: 1847 reporthook(blocknum, bs, size) 1848 while 1: 1849 block = fp.read(bs) 1850 if not block: 1851 break 1852 read += len(block) 1853 tfp.write(block) 1854 blocknum += 1 1855 if reporthook: 1856 reporthook(blocknum, bs, size) 1857 finally: 1858 tfp.close() 1859 finally: 1860 fp.close() 1861 1862 # raise exception if actual size does not match content-length header 1863 if size >= 0 and read < size: 1864 raise ContentTooShortError( 1865 "retrieval incomplete: got only %i out of %i bytes" 1866 % (read, size), result) 1867 1868 return result 1869 1870 # Each method named open_<type> knows how to open that type of URL 1871 1872 def _open_generic_http(self, connection_factory, url, data): 1873 """Make an HTTP connection using connection_class. 1874 1875 This is an internal method that should be called from 1876 open_http() or open_https(). 1877 1878 Arguments: 1879 - connection_factory should take a host name and return an 1880 HTTPConnection instance. 1881 - url is the url to retrieval or a host, relative-path pair. 1882 - data is payload for a POST request or None. 1883 """ 1884 1885 user_passwd = None 1886 proxy_passwd= None 1887 if isinstance(url, str): 1888 host, selector = _splithost(url) 1889 if host: 1890 user_passwd, host = _splituser(host) 1891 host = unquote(host) 1892 realhost = host 1893 else: 1894 host, selector = url 1895 # check whether the proxy contains authorization information 1896 proxy_passwd, host = _splituser(host) 1897 # now we proceed with the url we want to obtain 1898 urltype, rest = _splittype(selector) 1899 url = rest 1900 user_passwd = None 1901 if urltype.lower() != 'http': 1902 realhost = None 1903 else: 1904 realhost, rest = _splithost(rest) 1905 if realhost: 1906 user_passwd, realhost = _splituser(realhost) 1907 if user_passwd: 1908 selector = "%s://%s%s" % (urltype, realhost, rest) 1909 if proxy_bypass(realhost): 1910 host = realhost 1911 1912 if not host: raise OSError('http error', 'no host given') 1913 1914 if proxy_passwd: 1915 proxy_passwd = unquote(proxy_passwd) 1916 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii') 1917 else: 1918 proxy_auth = None 1919 1920 if user_passwd: 1921 user_passwd = unquote(user_passwd) 1922 auth = base64.b64encode(user_passwd.encode()).decode('ascii') 1923 else: 1924 auth = None 1925 http_conn = connection_factory(host) 1926 headers = {} 1927 if proxy_auth: 1928 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth 1929 if auth: 1930 headers["Authorization"] = "Basic %s" % auth 1931 if realhost: 1932 headers["Host"] = realhost 1933 1934 # Add Connection:close as we don't support persistent connections yet. 1935 # This helps in closing the socket and avoiding ResourceWarning 1936 1937 headers["Connection"] = "close" 1938 1939 for header, value in self.addheaders: 1940 headers[header] = value 1941 1942 if data is not None: 1943 headers["Content-Type"] = "application/x-www-form-urlencoded" 1944 http_conn.request("POST", selector, data, headers) 1945 else: 1946 http_conn.request("GET", selector, headers=headers) 1947 1948 try: 1949 response = http_conn.getresponse() 1950 except http.client.BadStatusLine: 1951 # something went wrong with the HTTP status line 1952 raise URLError("http protocol error: bad status line") 1953 1954 # According to RFC 2616, "2xx" code indicates that the client's 1955 # request was successfully received, understood, and accepted. 1956 if 200 <= response.status < 300: 1957 return addinfourl(response, response.msg, "http:" + url, 1958 response.status) 1959 else: 1960 return self.http_error( 1961 url, response.fp, 1962 response.status, response.reason, response.msg, data) 1963 1964 def open_http(self, url, data=None): 1965 """Use HTTP protocol.""" 1966 return self._open_generic_http(http.client.HTTPConnection, url, data) 1967 1968 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 1969 """Handle http errors. 1970 1971 Derived class can override this, or provide specific handlers 1972 named http_error_DDD where DDD is the 3-digit error code.""" 1973 # First check if there's a specific handler for this error 1974 name = 'http_error_%d' % errcode 1975 if hasattr(self, name): 1976 method = getattr(self, name) 1977 if data is None: 1978 result = method(url, fp, errcode, errmsg, headers) 1979 else: 1980 result = method(url, fp, errcode, errmsg, headers, data) 1981 if result: return result 1982 return self.http_error_default(url, fp, errcode, errmsg, headers) 1983 1984 def http_error_default(self, url, fp, errcode, errmsg, headers): 1985 """Default error handler: close the connection and raise OSError.""" 1986 fp.close() 1987 raise HTTPError(url, errcode, errmsg, headers, None) 1988 1989 if _have_ssl: 1990 def _https_connection(self, host): 1991 return http.client.HTTPSConnection(host, 1992 key_file=self.key_file, 1993 cert_file=self.cert_file) 1994 1995 def open_https(self, url, data=None): 1996 """Use HTTPS protocol.""" 1997 return self._open_generic_http(self._https_connection, url, data) 1998 1999 def open_file(self, url): 2000 """Use local file or FTP depending on form of URL.""" 2001 if not isinstance(url, str): 2002 raise URLError('file error: proxy support for file protocol currently not implemented') 2003 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': 2004 raise ValueError("file:// scheme is supported only on localhost") 2005 else: 2006 return self.open_local_file(url) 2007 2008 def open_local_file(self, url): 2009 """Use local file.""" 2010 import email.utils 2011 import mimetypes 2012 host, file = _splithost(url) 2013 localname = url2pathname(file) 2014 try: 2015 stats = os.stat(localname) 2016 except OSError as e: 2017 raise URLError(e.strerror, e.filename) 2018 size = stats.st_size 2019 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 2020 mtype = mimetypes.guess_type(url)[0] 2021 headers = email.message_from_string( 2022 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 2023 (mtype or 'text/plain', size, modified)) 2024 if not host: 2025 urlfile = file 2026 if file[:1] == '/': 2027 urlfile = 'file://' + file 2028 return addinfourl(open(localname, 'rb'), headers, urlfile) 2029 host, port = _splitport(host) 2030 if (not port 2031 and socket.gethostbyname(host) in ((localhost(),) + thishost())): 2032 urlfile = file 2033 if file[:1] == '/': 2034 urlfile = 'file://' + file 2035 elif file[:2] == './': 2036 raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url) 2037 return addinfourl(open(localname, 'rb'), headers, urlfile) 2038 raise URLError('local file error: not on local host') 2039 2040 def open_ftp(self, url): 2041 """Use FTP protocol.""" 2042 if not isinstance(url, str): 2043 raise URLError('ftp error: proxy support for ftp protocol currently not implemented') 2044 import mimetypes 2045 host, path = _splithost(url) 2046 if not host: raise URLError('ftp error: no host given') 2047 host, port = _splitport(host) 2048 user, host = _splituser(host) 2049 if user: user, passwd = _splitpasswd(user) 2050 else: passwd = None 2051 host = unquote(host) 2052 user = unquote(user or '') 2053 passwd = unquote(passwd or '') 2054 host = socket.gethostbyname(host) 2055 if not port: 2056 import ftplib 2057 port = ftplib.FTP_PORT 2058 else: 2059 port = int(port) 2060 path, attrs = _splitattr(path) 2061 path = unquote(path) 2062 dirs = path.split('/') 2063 dirs, file = dirs[:-1], dirs[-1] 2064 if dirs and not dirs[0]: dirs = dirs[1:] 2065 if dirs and not dirs[0]: dirs[0] = '/' 2066 key = user, host, port, '/'.join(dirs) 2067 # XXX thread unsafe! 2068 if len(self.ftpcache) > MAXFTPCACHE: 2069 # Prune the cache, rather arbitrarily 2070 for k in list(self.ftpcache): 2071 if k != key: 2072 v = self.ftpcache[k] 2073 del self.ftpcache[k] 2074 v.close() 2075 try: 2076 if key not in self.ftpcache: 2077 self.ftpcache[key] = \ 2078 ftpwrapper(user, passwd, host, port, dirs) 2079 if not file: type = 'D' 2080 else: type = 'I' 2081 for attr in attrs: 2082 attr, value = _splitvalue(attr) 2083 if attr.lower() == 'type' and \ 2084 value in ('a', 'A', 'i', 'I', 'd', 'D'): 2085 type = value.upper() 2086 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 2087 mtype = mimetypes.guess_type("ftp:" + url)[0] 2088 headers = "" 2089 if mtype: 2090 headers += "Content-Type: %s\n" % mtype 2091 if retrlen is not None and retrlen >= 0: 2092 headers += "Content-Length: %d\n" % retrlen 2093 headers = email.message_from_string(headers) 2094 return addinfourl(fp, headers, "ftp:" + url) 2095 except ftperrors() as exp: 2096 raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2]) 2097 2098 def open_data(self, url, data=None): 2099 """Use "data" URL.""" 2100 if not isinstance(url, str): 2101 raise URLError('data error: proxy support for data protocol currently not implemented') 2102 # ignore POSTed data 2103 # 2104 # syntax of data URLs: 2105 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 2106 # mediatype := [ type "/" subtype ] *( ";" parameter ) 2107 # data := *urlchar 2108 # parameter := attribute "=" value 2109 try: 2110 [type, data] = url.split(',', 1) 2111 except ValueError: 2112 raise OSError('data error', 'bad data URL') 2113 if not type: 2114 type = 'text/plain;charset=US-ASCII' 2115 semi = type.rfind(';') 2116 if semi >= 0 and '=' not in type[semi:]: 2117 encoding = type[semi+1:] 2118 type = type[:semi] 2119 else: 2120 encoding = '' 2121 msg = [] 2122 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', 2123 time.gmtime(time.time()))) 2124 msg.append('Content-type: %s' % type) 2125 if encoding == 'base64': 2126 # XXX is this encoding/decoding ok? 2127 data = base64.decodebytes(data.encode('ascii')).decode('latin-1') 2128 else: 2129 data = unquote(data) 2130 msg.append('Content-Length: %d' % len(data)) 2131 msg.append('') 2132 msg.append(data) 2133 msg = '\n'.join(msg) 2134 headers = email.message_from_string(msg) 2135 f = io.StringIO(msg) 2136 #f.fileno = None # needed for addinfourl 2137 return addinfourl(f, headers, url) 2138 2139 2140class FancyURLopener(URLopener): 2141 """Derived class with handlers for errors we can handle (perhaps).""" 2142 2143 def __init__(self, *args, **kwargs): 2144 URLopener.__init__(self, *args, **kwargs) 2145 self.auth_cache = {} 2146 self.tries = 0 2147 self.maxtries = 10 2148 2149 def http_error_default(self, url, fp, errcode, errmsg, headers): 2150 """Default error handling -- don't raise an exception.""" 2151 return addinfourl(fp, headers, "http:" + url, errcode) 2152 2153 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 2154 """Error 302 -- relocated (temporarily).""" 2155 self.tries += 1 2156 try: 2157 if self.maxtries and self.tries >= self.maxtries: 2158 if hasattr(self, "http_error_500"): 2159 meth = self.http_error_500 2160 else: 2161 meth = self.http_error_default 2162 return meth(url, fp, 500, 2163 "Internal Server Error: Redirect Recursion", 2164 headers) 2165 result = self.redirect_internal(url, fp, errcode, errmsg, 2166 headers, data) 2167 return result 2168 finally: 2169 self.tries = 0 2170 2171 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 2172 if 'location' in headers: 2173 newurl = headers['location'] 2174 elif 'uri' in headers: 2175 newurl = headers['uri'] 2176 else: 2177 return 2178 fp.close() 2179 2180 # In case the server sent a relative URL, join with original: 2181 newurl = urljoin(self.type + ":" + url, newurl) 2182 2183 urlparts = urlparse(newurl) 2184 2185 # For security reasons, we don't allow redirection to anything other 2186 # than http, https and ftp. 2187 2188 # We are using newer HTTPError with older redirect_internal method 2189 # This older method will get deprecated in 3.3 2190 2191 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 2192 raise HTTPError(newurl, errcode, 2193 errmsg + 2194 " Redirection to url '%s' is not allowed." % newurl, 2195 headers, fp) 2196 2197 return self.open(newurl) 2198 2199 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 2200 """Error 301 -- also relocated (permanently).""" 2201 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2202 2203 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): 2204 """Error 303 -- also relocated (essentially identical to 302).""" 2205 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2206 2207 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): 2208 """Error 307 -- relocated, but turn POST into error.""" 2209 if data is None: 2210 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2211 else: 2212 return self.http_error_default(url, fp, errcode, errmsg, headers) 2213 2214 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None, 2215 retry=False): 2216 """Error 401 -- authentication required. 2217 This function supports Basic authentication only.""" 2218 if 'www-authenticate' not in headers: 2219 URLopener.http_error_default(self, url, fp, 2220 errcode, errmsg, headers) 2221 stuff = headers['www-authenticate'] 2222 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 2223 if not match: 2224 URLopener.http_error_default(self, url, fp, 2225 errcode, errmsg, headers) 2226 scheme, realm = match.groups() 2227 if scheme.lower() != 'basic': 2228 URLopener.http_error_default(self, url, fp, 2229 errcode, errmsg, headers) 2230 if not retry: 2231 URLopener.http_error_default(self, url, fp, errcode, errmsg, 2232 headers) 2233 name = 'retry_' + self.type + '_basic_auth' 2234 if data is None: 2235 return getattr(self,name)(url, realm) 2236 else: 2237 return getattr(self,name)(url, realm, data) 2238 2239 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None, 2240 retry=False): 2241 """Error 407 -- proxy authentication required. 2242 This function supports Basic authentication only.""" 2243 if 'proxy-authenticate' not in headers: 2244 URLopener.http_error_default(self, url, fp, 2245 errcode, errmsg, headers) 2246 stuff = headers['proxy-authenticate'] 2247 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 2248 if not match: 2249 URLopener.http_error_default(self, url, fp, 2250 errcode, errmsg, headers) 2251 scheme, realm = match.groups() 2252 if scheme.lower() != 'basic': 2253 URLopener.http_error_default(self, url, fp, 2254 errcode, errmsg, headers) 2255 if not retry: 2256 URLopener.http_error_default(self, url, fp, errcode, errmsg, 2257 headers) 2258 name = 'retry_proxy_' + self.type + '_basic_auth' 2259 if data is None: 2260 return getattr(self,name)(url, realm) 2261 else: 2262 return getattr(self,name)(url, realm, data) 2263 2264 def retry_proxy_http_basic_auth(self, url, realm, data=None): 2265 host, selector = _splithost(url) 2266 newurl = 'http://' + host + selector 2267 proxy = self.proxies['http'] 2268 urltype, proxyhost = _splittype(proxy) 2269 proxyhost, proxyselector = _splithost(proxyhost) 2270 i = proxyhost.find('@') + 1 2271 proxyhost = proxyhost[i:] 2272 user, passwd = self.get_user_passwd(proxyhost, realm, i) 2273 if not (user or passwd): return None 2274 proxyhost = "%s:%s@%s" % (quote(user, safe=''), 2275 quote(passwd, safe=''), proxyhost) 2276 self.proxies['http'] = 'http://' + proxyhost + proxyselector 2277 if data is None: 2278 return self.open(newurl) 2279 else: 2280 return self.open(newurl, data) 2281 2282 def retry_proxy_https_basic_auth(self, url, realm, data=None): 2283 host, selector = _splithost(url) 2284 newurl = 'https://' + host + selector 2285 proxy = self.proxies['https'] 2286 urltype, proxyhost = _splittype(proxy) 2287 proxyhost, proxyselector = _splithost(proxyhost) 2288 i = proxyhost.find('@') + 1 2289 proxyhost = proxyhost[i:] 2290 user, passwd = self.get_user_passwd(proxyhost, realm, i) 2291 if not (user or passwd): return None 2292 proxyhost = "%s:%s@%s" % (quote(user, safe=''), 2293 quote(passwd, safe=''), proxyhost) 2294 self.proxies['https'] = 'https://' + proxyhost + proxyselector 2295 if data is None: 2296 return self.open(newurl) 2297 else: 2298 return self.open(newurl, data) 2299 2300 def retry_http_basic_auth(self, url, realm, data=None): 2301 host, selector = _splithost(url) 2302 i = host.find('@') + 1 2303 host = host[i:] 2304 user, passwd = self.get_user_passwd(host, realm, i) 2305 if not (user or passwd): return None 2306 host = "%s:%s@%s" % (quote(user, safe=''), 2307 quote(passwd, safe=''), host) 2308 newurl = 'http://' + host + selector 2309 if data is None: 2310 return self.open(newurl) 2311 else: 2312 return self.open(newurl, data) 2313 2314 def retry_https_basic_auth(self, url, realm, data=None): 2315 host, selector = _splithost(url) 2316 i = host.find('@') + 1 2317 host = host[i:] 2318 user, passwd = self.get_user_passwd(host, realm, i) 2319 if not (user or passwd): return None 2320 host = "%s:%s@%s" % (quote(user, safe=''), 2321 quote(passwd, safe=''), host) 2322 newurl = 'https://' + host + selector 2323 if data is None: 2324 return self.open(newurl) 2325 else: 2326 return self.open(newurl, data) 2327 2328 def get_user_passwd(self, host, realm, clear_cache=0): 2329 key = realm + '@' + host.lower() 2330 if key in self.auth_cache: 2331 if clear_cache: 2332 del self.auth_cache[key] 2333 else: 2334 return self.auth_cache[key] 2335 user, passwd = self.prompt_user_passwd(host, realm) 2336 if user or passwd: self.auth_cache[key] = (user, passwd) 2337 return user, passwd 2338 2339 def prompt_user_passwd(self, host, realm): 2340 """Override this in a GUI environment!""" 2341 import getpass 2342 try: 2343 user = input("Enter username for %s at %s: " % (realm, host)) 2344 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 2345 (user, realm, host)) 2346 return user, passwd 2347 except KeyboardInterrupt: 2348 print() 2349 return None, None 2350 2351 2352# Utility functions 2353 2354_localhost = None 2355def localhost(): 2356 """Return the IP address of the magic hostname 'localhost'.""" 2357 global _localhost 2358 if _localhost is None: 2359 _localhost = socket.gethostbyname('localhost') 2360 return _localhost 2361 2362_thishost = None 2363def thishost(): 2364 """Return the IP addresses of the current host.""" 2365 global _thishost 2366 if _thishost is None: 2367 try: 2368 _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2]) 2369 except socket.gaierror: 2370 _thishost = tuple(socket.gethostbyname_ex('localhost')[2]) 2371 return _thishost 2372 2373_ftperrors = None 2374def ftperrors(): 2375 """Return the set of errors raised by the FTP class.""" 2376 global _ftperrors 2377 if _ftperrors is None: 2378 import ftplib 2379 _ftperrors = ftplib.all_errors 2380 return _ftperrors 2381 2382_noheaders = None 2383def noheaders(): 2384 """Return an empty email Message object.""" 2385 global _noheaders 2386 if _noheaders is None: 2387 _noheaders = email.message_from_string("") 2388 return _noheaders 2389 2390 2391# Utility classes 2392 2393class ftpwrapper: 2394 """Class used by open_ftp() for cache of open FTP connections.""" 2395 2396 def __init__(self, user, passwd, host, port, dirs, timeout=None, 2397 persistent=True): 2398 self.user = user 2399 self.passwd = passwd 2400 self.host = host 2401 self.port = port 2402 self.dirs = dirs 2403 self.timeout = timeout 2404 self.refcount = 0 2405 self.keepalive = persistent 2406 try: 2407 self.init() 2408 except: 2409 self.close() 2410 raise 2411 2412 def init(self): 2413 import ftplib 2414 self.busy = 0 2415 self.ftp = ftplib.FTP() 2416 self.ftp.connect(self.host, self.port, self.timeout) 2417 self.ftp.login(self.user, self.passwd) 2418 _target = '/'.join(self.dirs) 2419 self.ftp.cwd(_target) 2420 2421 def retrfile(self, file, type): 2422 import ftplib 2423 self.endtransfer() 2424 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 2425 else: cmd = 'TYPE ' + type; isdir = 0 2426 try: 2427 self.ftp.voidcmd(cmd) 2428 except ftplib.all_errors: 2429 self.init() 2430 self.ftp.voidcmd(cmd) 2431 conn = None 2432 if file and not isdir: 2433 # Try to retrieve as a file 2434 try: 2435 cmd = 'RETR ' + file 2436 conn, retrlen = self.ftp.ntransfercmd(cmd) 2437 except ftplib.error_perm as reason: 2438 if str(reason)[:3] != '550': 2439 raise URLError('ftp error: %r' % reason).with_traceback( 2440 sys.exc_info()[2]) 2441 if not conn: 2442 # Set transfer mode to ASCII! 2443 self.ftp.voidcmd('TYPE A') 2444 # Try a directory listing. Verify that directory exists. 2445 if file: 2446 pwd = self.ftp.pwd() 2447 try: 2448 try: 2449 self.ftp.cwd(file) 2450 except ftplib.error_perm as reason: 2451 raise URLError('ftp error: %r' % reason) from reason 2452 finally: 2453 self.ftp.cwd(pwd) 2454 cmd = 'LIST ' + file 2455 else: 2456 cmd = 'LIST' 2457 conn, retrlen = self.ftp.ntransfercmd(cmd) 2458 self.busy = 1 2459 2460 ftpobj = addclosehook(conn.makefile('rb'), self.file_close) 2461 self.refcount += 1 2462 conn.close() 2463 # Pass back both a suitably decorated object and a retrieval length 2464 return (ftpobj, retrlen) 2465 2466 def endtransfer(self): 2467 self.busy = 0 2468 2469 def close(self): 2470 self.keepalive = False 2471 if self.refcount <= 0: 2472 self.real_close() 2473 2474 def file_close(self): 2475 self.endtransfer() 2476 self.refcount -= 1 2477 if self.refcount <= 0 and not self.keepalive: 2478 self.real_close() 2479 2480 def real_close(self): 2481 self.endtransfer() 2482 try: 2483 self.ftp.close() 2484 except ftperrors(): 2485 pass 2486 2487# Proxy handling 2488def getproxies_environment(): 2489 """Return a dictionary of scheme -> proxy server URL mappings. 2490 2491 Scan the environment for variables named <scheme>_proxy; 2492 this seems to be the standard convention. If you need a 2493 different way, you can pass a proxies dictionary to the 2494 [Fancy]URLopener constructor. 2495 2496 """ 2497 proxies = {} 2498 # in order to prefer lowercase variables, process environment in 2499 # two passes: first matches any, second pass matches lowercase only 2500 for name, value in os.environ.items(): 2501 name = name.lower() 2502 if value and name[-6:] == '_proxy': 2503 proxies[name[:-6]] = value 2504 # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY 2505 # (non-all-lowercase) as it may be set from the web server by a "Proxy:" 2506 # header from the client 2507 # If "proxy" is lowercase, it will still be used thanks to the next block 2508 if 'REQUEST_METHOD' in os.environ: 2509 proxies.pop('http', None) 2510 for name, value in os.environ.items(): 2511 if name[-6:] == '_proxy': 2512 name = name.lower() 2513 if value: 2514 proxies[name[:-6]] = value 2515 else: 2516 proxies.pop(name[:-6], None) 2517 return proxies 2518 2519def proxy_bypass_environment(host, proxies=None): 2520 """Test if proxies should not be used for a particular host. 2521 2522 Checks the proxy dict for the value of no_proxy, which should 2523 be a list of comma separated DNS suffixes, or '*' for all hosts. 2524 2525 """ 2526 if proxies is None: 2527 proxies = getproxies_environment() 2528 # don't bypass, if no_proxy isn't specified 2529 try: 2530 no_proxy = proxies['no'] 2531 except KeyError: 2532 return False 2533 # '*' is special case for always bypass 2534 if no_proxy == '*': 2535 return True 2536 host = host.lower() 2537 # strip port off host 2538 hostonly, port = _splitport(host) 2539 # check if the host ends with any of the DNS suffixes 2540 for name in no_proxy.split(','): 2541 name = name.strip() 2542 if name: 2543 name = name.lstrip('.') # ignore leading dots 2544 name = name.lower() 2545 if hostonly == name or host == name: 2546 return True 2547 name = '.' + name 2548 if hostonly.endswith(name) or host.endswith(name): 2549 return True 2550 # otherwise, don't bypass 2551 return False 2552 2553 2554# This code tests an OSX specific data structure but is testable on all 2555# platforms 2556def _proxy_bypass_macosx_sysconf(host, proxy_settings): 2557 """ 2558 Return True iff this host shouldn't be accessed using a proxy 2559 2560 This function uses the MacOSX framework SystemConfiguration 2561 to fetch the proxy information. 2562 2563 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie: 2564 { 'exclude_simple': bool, 2565 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16'] 2566 } 2567 """ 2568 from fnmatch import fnmatch 2569 2570 hostonly, port = _splitport(host) 2571 2572 def ip2num(ipAddr): 2573 parts = ipAddr.split('.') 2574 parts = list(map(int, parts)) 2575 if len(parts) != 4: 2576 parts = (parts + [0, 0, 0, 0])[:4] 2577 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] 2578 2579 # Check for simple host names: 2580 if '.' not in host: 2581 if proxy_settings['exclude_simple']: 2582 return True 2583 2584 hostIP = None 2585 2586 for value in proxy_settings.get('exceptions', ()): 2587 # Items in the list are strings like these: *.local, 169.254/16 2588 if not value: continue 2589 2590 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) 2591 if m is not None: 2592 if hostIP is None: 2593 try: 2594 hostIP = socket.gethostbyname(hostonly) 2595 hostIP = ip2num(hostIP) 2596 except OSError: 2597 continue 2598 2599 base = ip2num(m.group(1)) 2600 mask = m.group(2) 2601 if mask is None: 2602 mask = 8 * (m.group(1).count('.') + 1) 2603 else: 2604 mask = int(mask[1:]) 2605 2606 if mask < 0 or mask > 32: 2607 # System libraries ignore invalid prefix lengths 2608 continue 2609 2610 mask = 32 - mask 2611 2612 if (hostIP >> mask) == (base >> mask): 2613 return True 2614 2615 elif fnmatch(host, value): 2616 return True 2617 2618 return False 2619 2620 2621if sys.platform == 'darwin': 2622 from _scproxy import _get_proxy_settings, _get_proxies 2623 2624 def proxy_bypass_macosx_sysconf(host): 2625 proxy_settings = _get_proxy_settings() 2626 return _proxy_bypass_macosx_sysconf(host, proxy_settings) 2627 2628 def getproxies_macosx_sysconf(): 2629 """Return a dictionary of scheme -> proxy server URL mappings. 2630 2631 This function uses the MacOSX framework SystemConfiguration 2632 to fetch the proxy information. 2633 """ 2634 return _get_proxies() 2635 2636 2637 2638 def proxy_bypass(host): 2639 """Return True, if host should be bypassed. 2640 2641 Checks proxy settings gathered from the environment, if specified, 2642 or from the MacOSX framework SystemConfiguration. 2643 2644 """ 2645 proxies = getproxies_environment() 2646 if proxies: 2647 return proxy_bypass_environment(host, proxies) 2648 else: 2649 return proxy_bypass_macosx_sysconf(host) 2650 2651 def getproxies(): 2652 return getproxies_environment() or getproxies_macosx_sysconf() 2653 2654 2655elif os.name == 'nt': 2656 def getproxies_registry(): 2657 """Return a dictionary of scheme -> proxy server URL mappings. 2658 2659 Win32 uses the registry to store proxies. 2660 2661 """ 2662 proxies = {} 2663 try: 2664 import winreg 2665 except ImportError: 2666 # Std module, so should be around - but you never know! 2667 return proxies 2668 try: 2669 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2670 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2671 proxyEnable = winreg.QueryValueEx(internetSettings, 2672 'ProxyEnable')[0] 2673 if proxyEnable: 2674 # Returned as Unicode but problems if not converted to ASCII 2675 proxyServer = str(winreg.QueryValueEx(internetSettings, 2676 'ProxyServer')[0]) 2677 if '=' in proxyServer: 2678 # Per-protocol settings 2679 for p in proxyServer.split(';'): 2680 protocol, address = p.split('=', 1) 2681 # See if address has a type:// prefix 2682 if not re.match('(?:[^/:]+)://', address): 2683 address = '%s://%s' % (protocol, address) 2684 proxies[protocol] = address 2685 else: 2686 # Use one setting for all protocols 2687 if proxyServer[:5] == 'http:': 2688 proxies['http'] = proxyServer 2689 else: 2690 proxies['http'] = 'http://%s' % proxyServer 2691 proxies['https'] = 'https://%s' % proxyServer 2692 proxies['ftp'] = 'ftp://%s' % proxyServer 2693 internetSettings.Close() 2694 except (OSError, ValueError, TypeError): 2695 # Either registry key not found etc, or the value in an 2696 # unexpected format. 2697 # proxies already set up to be empty so nothing to do 2698 pass 2699 return proxies 2700 2701 def getproxies(): 2702 """Return a dictionary of scheme -> proxy server URL mappings. 2703 2704 Returns settings gathered from the environment, if specified, 2705 or the registry. 2706 2707 """ 2708 return getproxies_environment() or getproxies_registry() 2709 2710 def proxy_bypass_registry(host): 2711 try: 2712 import winreg 2713 except ImportError: 2714 # Std modules, so should be around - but you never know! 2715 return 0 2716 try: 2717 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2718 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2719 proxyEnable = winreg.QueryValueEx(internetSettings, 2720 'ProxyEnable')[0] 2721 proxyOverride = str(winreg.QueryValueEx(internetSettings, 2722 'ProxyOverride')[0]) 2723 # ^^^^ Returned as Unicode but problems if not converted to ASCII 2724 except OSError: 2725 return 0 2726 if not proxyEnable or not proxyOverride: 2727 return 0 2728 # try to make a host list from name and IP address. 2729 rawHost, port = _splitport(host) 2730 host = [rawHost] 2731 try: 2732 addr = socket.gethostbyname(rawHost) 2733 if addr != rawHost: 2734 host.append(addr) 2735 except OSError: 2736 pass 2737 try: 2738 fqdn = socket.getfqdn(rawHost) 2739 if fqdn != rawHost: 2740 host.append(fqdn) 2741 except OSError: 2742 pass 2743 # make a check value list from the registry entry: replace the 2744 # '<local>' string by the localhost entry and the corresponding 2745 # canonical entry. 2746 proxyOverride = proxyOverride.split(';') 2747 # now check if we match one of the registry values. 2748 for test in proxyOverride: 2749 if test == '<local>': 2750 if '.' not in rawHost: 2751 return 1 2752 test = test.replace(".", r"\.") # mask dots 2753 test = test.replace("*", r".*") # change glob sequence 2754 test = test.replace("?", r".") # change glob char 2755 for val in host: 2756 if re.match(test, val, re.I): 2757 return 1 2758 return 0 2759 2760 def proxy_bypass(host): 2761 """Return True, if host should be bypassed. 2762 2763 Checks proxy settings gathered from the environment, if specified, 2764 or the registry. 2765 2766 """ 2767 proxies = getproxies_environment() 2768 if proxies: 2769 return proxy_bypass_environment(host, proxies) 2770 else: 2771 return proxy_bypass_registry(host) 2772 2773else: 2774 # By default use environment variables 2775 getproxies = getproxies_environment 2776 proxy_bypass = proxy_bypass_environment 2777