1"""An extensible library for opening URLs using a variety of protocols 2 3The simplest way to use this module is to call the urlopen function, 4which accepts a string containing a URL or a Request object (described 5below). It opens the URL and returns the results as file-like 6object; the returned object has some extra methods described below. 7 8The OpenerDirector manages a collection of Handler objects that do 9all the actual work. Each Handler implements a particular protocol or 10option. The OpenerDirector is a composite object that invokes the 11Handlers needed to open the requested URL. For example, the 12HTTPHandler performs HTTP GET and POST requests and deals with 13non-error returns. The HTTPRedirectHandler automatically deals with 14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 15deals with digest authentication. 16 17urlopen(url, data=None) -- Basic usage is the same as original 18urllib. pass the url and optionally data to post to an HTTP URL, and 19get a file-like object back. One difference is that you can also pass 20a Request instance instead of URL. Raises a URLError (subclass of 21OSError); for HTTP errors, raises an HTTPError, which can also be 22treated as a valid response. 23 24build_opener -- Function that creates a new OpenerDirector instance. 25Will install the default handlers. Accepts one or more Handlers as 26arguments, either instances or Handler classes that it will 27instantiate. If one of the argument is a subclass of the default 28handler, the argument will be installed instead of the default. 29 30install_opener -- Installs a new opener as the default opener. 31 32objects of interest: 33 34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 35the Handler classes, while dealing with requests and responses. 36 37Request -- An object that encapsulates the state of a request. The 38state can be as simple as the URL. It can also include extra HTTP 39headers, e.g. a User-Agent. 40 41BaseHandler -- 42 43internals: 44BaseHandler and parent 45_call_chain conventions 46 47Example usage: 48 49import urllib.request 50 51# set up authentication info 52authinfo = urllib.request.HTTPBasicAuthHandler() 53authinfo.add_password(realm='PDQ Application', 54 uri='https://mahler:8092/site-updates.py', 55 user='klem', 56 passwd='geheim$parole') 57 58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"}) 59 60# build a new opener that adds authentication and caching FTP handlers 61opener = urllib.request.build_opener(proxy_support, authinfo, 62 urllib.request.CacheFTPHandler) 63 64# install it 65urllib.request.install_opener(opener) 66 67f = urllib.request.urlopen('http://www.python.org/') 68""" 69 70# XXX issues: 71# If an authentication error handler that tries to perform 72# authentication for some reason but fails, how should the error be 73# signalled? The client needs to know the HTTP error code. But if 74# the handler knows that the problem was, e.g., that it didn't know 75# that hash algo that requested in the challenge, it would be good to 76# pass that information along to the client, too. 77# ftp errors aren't handled cleanly 78# check digest against correct (i.e. non-apache) implementation 79 80# Possible extensions: 81# complex proxies XXX not sure what exactly was meant by this 82# abstract factory for opener 83 84import base64 85import bisect 86import email 87import hashlib 88import http.client 89import io 90import os 91import posixpath 92import re 93import socket 94import string 95import sys 96import time 97import tempfile 98import contextlib 99import warnings 100 101 102from urllib.error import URLError, HTTPError, ContentTooShortError 103from urllib.parse import ( 104 urlparse, urlsplit, urljoin, unwrap, quote, unquote, 105 splittype, splithost, splitport, splituser, splitpasswd, 106 splitattr, splitquery, splitvalue, splittag, to_bytes, 107 unquote_to_bytes, urlunparse) 108from urllib.response import addinfourl, addclosehook 109 110# check for SSL 111try: 112 import ssl 113except ImportError: 114 _have_ssl = False 115else: 116 _have_ssl = True 117 118__all__ = [ 119 # Classes 120 'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler', 121 'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler', 122 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 123 'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler', 124 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler', 125 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler', 126 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler', 127 'UnknownHandler', 'HTTPErrorProcessor', 128 # Functions 129 'urlopen', 'install_opener', 'build_opener', 130 'pathname2url', 'url2pathname', 'getproxies', 131 # Legacy interface 132 'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener', 133] 134 135# used in User-Agent header sent 136__version__ = '%d.%d' % sys.version_info[:2] 137 138_opener = None 139def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, 140 *, cafile=None, capath=None, cadefault=False, context=None): 141 '''Open the URL url, which can be either a string or a Request object. 142 143 *data* must be an object specifying additional data to be sent to 144 the server, or None if no such data is needed. See Request for 145 details. 146 147 urllib.request module uses HTTP/1.1 and includes a "Connection:close" 148 header in its HTTP requests. 149 150 The optional *timeout* parameter specifies a timeout in seconds for 151 blocking operations like the connection attempt (if not specified, the 152 global default timeout setting will be used). This only works for HTTP, 153 HTTPS and FTP connections. 154 155 If *context* is specified, it must be a ssl.SSLContext instance describing 156 the various SSL options. See HTTPSConnection for more details. 157 158 The optional *cafile* and *capath* parameters specify a set of trusted CA 159 certificates for HTTPS requests. cafile should point to a single file 160 containing a bundle of CA certificates, whereas capath should point to a 161 directory of hashed certificate files. More information can be found in 162 ssl.SSLContext.load_verify_locations(). 163 164 The *cadefault* parameter is ignored. 165 166 This function always returns an object which can work as a context 167 manager and has methods such as 168 169 * geturl() - return the URL of the resource retrieved, commonly used to 170 determine if a redirect was followed 171 172 * info() - return the meta-information of the page, such as headers, in the 173 form of an email.message_from_string() instance (see Quick Reference to 174 HTTP Headers) 175 176 * getcode() - return the HTTP status code of the response. Raises URLError 177 on errors. 178 179 For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse 180 object slightly modified. In addition to the three new methods above, the 181 msg attribute contains the same information as the reason attribute --- 182 the reason phrase returned by the server --- instead of the response 183 headers as it is specified in the documentation for HTTPResponse. 184 185 For FTP, file, and data URLs and requests explicitly handled by legacy 186 URLopener and FancyURLopener classes, this function returns a 187 urllib.response.addinfourl object. 188 189 Note that None may be returned if no handler handles the request (though 190 the default installed global OpenerDirector uses UnknownHandler to ensure 191 this never happens). 192 193 In addition, if proxy settings are detected (for example, when a *_proxy 194 environment variable like http_proxy is set), ProxyHandler is default 195 installed and makes sure the requests are handled through the proxy. 196 197 ''' 198 global _opener 199 if cafile or capath or cadefault: 200 import warnings 201 warnings.warn("cafile, capath and cadefault are deprecated, use a " 202 "custom context instead.", DeprecationWarning, 2) 203 if context is not None: 204 raise ValueError( 205 "You can't pass both context and any of cafile, capath, and " 206 "cadefault" 207 ) 208 if not _have_ssl: 209 raise ValueError('SSL support not available') 210 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH, 211 cafile=cafile, 212 capath=capath) 213 https_handler = HTTPSHandler(context=context) 214 opener = build_opener(https_handler) 215 elif context: 216 https_handler = HTTPSHandler(context=context) 217 opener = build_opener(https_handler) 218 elif _opener is None: 219 _opener = opener = build_opener() 220 else: 221 opener = _opener 222 return opener.open(url, data, timeout) 223 224def install_opener(opener): 225 global _opener 226 _opener = opener 227 228_url_tempfiles = [] 229def urlretrieve(url, filename=None, reporthook=None, data=None): 230 """ 231 Retrieve a URL into a temporary location on disk. 232 233 Requires a URL argument. If a filename is passed, it is used as 234 the temporary file location. The reporthook argument should be 235 a callable that accepts a block number, a read size, and the 236 total file size of the URL target. The data argument should be 237 valid URL encoded data. 238 239 If a filename is passed and the URL points to a local resource, 240 the result is a copy from local file to new file. 241 242 Returns a tuple containing the path to the newly created 243 data file as well as the resulting HTTPMessage object. 244 """ 245 url_type, path = splittype(url) 246 247 with contextlib.closing(urlopen(url, data)) as fp: 248 headers = fp.info() 249 250 # Just return the local path and the "headers" for file:// 251 # URLs. No sense in performing a copy unless requested. 252 if url_type == "file" and not filename: 253 return os.path.normpath(path), headers 254 255 # Handle temporary file setup. 256 if filename: 257 tfp = open(filename, 'wb') 258 else: 259 tfp = tempfile.NamedTemporaryFile(delete=False) 260 filename = tfp.name 261 _url_tempfiles.append(filename) 262 263 with tfp: 264 result = filename, headers 265 bs = 1024*8 266 size = -1 267 read = 0 268 blocknum = 0 269 if "content-length" in headers: 270 size = int(headers["Content-Length"]) 271 272 if reporthook: 273 reporthook(blocknum, bs, size) 274 275 while True: 276 block = fp.read(bs) 277 if not block: 278 break 279 read += len(block) 280 tfp.write(block) 281 blocknum += 1 282 if reporthook: 283 reporthook(blocknum, bs, size) 284 285 if size >= 0 and read < size: 286 raise ContentTooShortError( 287 "retrieval incomplete: got only %i out of %i bytes" 288 % (read, size), result) 289 290 return result 291 292def urlcleanup(): 293 """Clean up temporary files from urlretrieve calls.""" 294 for temp_file in _url_tempfiles: 295 try: 296 os.unlink(temp_file) 297 except OSError: 298 pass 299 300 del _url_tempfiles[:] 301 global _opener 302 if _opener: 303 _opener = None 304 305# copied from cookielib.py 306_cut_port_re = re.compile(r":\d+$", re.ASCII) 307def request_host(request): 308 """Return request-host, as defined by RFC 2965. 309 310 Variation from RFC: returned value is lowercased, for convenient 311 comparison. 312 313 """ 314 url = request.full_url 315 host = urlparse(url)[1] 316 if host == "": 317 host = request.get_header("Host", "") 318 319 # remove port, if present 320 host = _cut_port_re.sub("", host, 1) 321 return host.lower() 322 323class Request: 324 325 def __init__(self, url, data=None, headers={}, 326 origin_req_host=None, unverifiable=False, 327 method=None): 328 self.full_url = url 329 self.headers = {} 330 self.unredirected_hdrs = {} 331 self._data = None 332 self.data = data 333 self._tunnel_host = None 334 for key, value in headers.items(): 335 self.add_header(key, value) 336 if origin_req_host is None: 337 origin_req_host = request_host(self) 338 self.origin_req_host = origin_req_host 339 self.unverifiable = unverifiable 340 if method: 341 self.method = method 342 343 @property 344 def full_url(self): 345 if self.fragment: 346 return '{}#{}'.format(self._full_url, self.fragment) 347 return self._full_url 348 349 @full_url.setter 350 def full_url(self, url): 351 # unwrap('<URL:type://host/path>') --> 'type://host/path' 352 self._full_url = unwrap(url) 353 self._full_url, self.fragment = splittag(self._full_url) 354 self._parse() 355 356 @full_url.deleter 357 def full_url(self): 358 self._full_url = None 359 self.fragment = None 360 self.selector = '' 361 362 @property 363 def data(self): 364 return self._data 365 366 @data.setter 367 def data(self, data): 368 if data != self._data: 369 self._data = data 370 # issue 16464 371 # if we change data we need to remove content-length header 372 # (cause it's most probably calculated for previous value) 373 if self.has_header("Content-length"): 374 self.remove_header("Content-length") 375 376 @data.deleter 377 def data(self): 378 self.data = None 379 380 def _parse(self): 381 self.type, rest = splittype(self._full_url) 382 if self.type is None: 383 raise ValueError("unknown url type: %r" % self.full_url) 384 self.host, self.selector = splithost(rest) 385 if self.host: 386 self.host = unquote(self.host) 387 388 def get_method(self): 389 """Return a string indicating the HTTP request method.""" 390 default_method = "POST" if self.data is not None else "GET" 391 return getattr(self, 'method', default_method) 392 393 def get_full_url(self): 394 return self.full_url 395 396 def set_proxy(self, host, type): 397 if self.type == 'https' and not self._tunnel_host: 398 self._tunnel_host = self.host 399 else: 400 self.type= type 401 self.selector = self.full_url 402 self.host = host 403 404 def has_proxy(self): 405 return self.selector == self.full_url 406 407 def add_header(self, key, val): 408 # useful for something like authentication 409 self.headers[key.capitalize()] = val 410 411 def add_unredirected_header(self, key, val): 412 # will not be added to a redirected request 413 self.unredirected_hdrs[key.capitalize()] = val 414 415 def has_header(self, header_name): 416 return (header_name in self.headers or 417 header_name in self.unredirected_hdrs) 418 419 def get_header(self, header_name, default=None): 420 return self.headers.get( 421 header_name, 422 self.unredirected_hdrs.get(header_name, default)) 423 424 def remove_header(self, header_name): 425 self.headers.pop(header_name, None) 426 self.unredirected_hdrs.pop(header_name, None) 427 428 def header_items(self): 429 hdrs = self.unredirected_hdrs.copy() 430 hdrs.update(self.headers) 431 return list(hdrs.items()) 432 433class OpenerDirector: 434 def __init__(self): 435 client_version = "Python-urllib/%s" % __version__ 436 self.addheaders = [('User-agent', client_version)] 437 # self.handlers is retained only for backward compatibility 438 self.handlers = [] 439 # manage the individual handlers 440 self.handle_open = {} 441 self.handle_error = {} 442 self.process_response = {} 443 self.process_request = {} 444 445 def add_handler(self, handler): 446 if not hasattr(handler, "add_parent"): 447 raise TypeError("expected BaseHandler instance, got %r" % 448 type(handler)) 449 450 added = False 451 for meth in dir(handler): 452 if meth in ["redirect_request", "do_open", "proxy_open"]: 453 # oops, coincidental match 454 continue 455 456 i = meth.find("_") 457 protocol = meth[:i] 458 condition = meth[i+1:] 459 460 if condition.startswith("error"): 461 j = condition.find("_") + i + 1 462 kind = meth[j+1:] 463 try: 464 kind = int(kind) 465 except ValueError: 466 pass 467 lookup = self.handle_error.get(protocol, {}) 468 self.handle_error[protocol] = lookup 469 elif condition == "open": 470 kind = protocol 471 lookup = self.handle_open 472 elif condition == "response": 473 kind = protocol 474 lookup = self.process_response 475 elif condition == "request": 476 kind = protocol 477 lookup = self.process_request 478 else: 479 continue 480 481 handlers = lookup.setdefault(kind, []) 482 if handlers: 483 bisect.insort(handlers, handler) 484 else: 485 handlers.append(handler) 486 added = True 487 488 if added: 489 bisect.insort(self.handlers, handler) 490 handler.add_parent(self) 491 492 def close(self): 493 # Only exists for backwards compatibility. 494 pass 495 496 def _call_chain(self, chain, kind, meth_name, *args): 497 # Handlers raise an exception if no one else should try to handle 498 # the request, or return None if they can't but another handler 499 # could. Otherwise, they return the response. 500 handlers = chain.get(kind, ()) 501 for handler in handlers: 502 func = getattr(handler, meth_name) 503 result = func(*args) 504 if result is not None: 505 return result 506 507 def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): 508 # accept a URL or a Request object 509 if isinstance(fullurl, str): 510 req = Request(fullurl, data) 511 else: 512 req = fullurl 513 if data is not None: 514 req.data = data 515 516 req.timeout = timeout 517 protocol = req.type 518 519 # pre-process request 520 meth_name = protocol+"_request" 521 for processor in self.process_request.get(protocol, []): 522 meth = getattr(processor, meth_name) 523 req = meth(req) 524 525 response = self._open(req, data) 526 527 # post-process response 528 meth_name = protocol+"_response" 529 for processor in self.process_response.get(protocol, []): 530 meth = getattr(processor, meth_name) 531 response = meth(req, response) 532 533 return response 534 535 def _open(self, req, data=None): 536 result = self._call_chain(self.handle_open, 'default', 537 'default_open', req) 538 if result: 539 return result 540 541 protocol = req.type 542 result = self._call_chain(self.handle_open, protocol, protocol + 543 '_open', req) 544 if result: 545 return result 546 547 return self._call_chain(self.handle_open, 'unknown', 548 'unknown_open', req) 549 550 def error(self, proto, *args): 551 if proto in ('http', 'https'): 552 # XXX http[s] protocols are special-cased 553 dict = self.handle_error['http'] # https is not different than http 554 proto = args[2] # YUCK! 555 meth_name = 'http_error_%s' % proto 556 http_err = 1 557 orig_args = args 558 else: 559 dict = self.handle_error 560 meth_name = proto + '_error' 561 http_err = 0 562 args = (dict, proto, meth_name) + args 563 result = self._call_chain(*args) 564 if result: 565 return result 566 567 if http_err: 568 args = (dict, 'default', 'http_error_default') + orig_args 569 return self._call_chain(*args) 570 571# XXX probably also want an abstract factory that knows when it makes 572# sense to skip a superclass in favor of a subclass and when it might 573# make sense to include both 574 575def build_opener(*handlers): 576 """Create an opener object from a list of handlers. 577 578 The opener will use several default handlers, including support 579 for HTTP, FTP and when applicable HTTPS. 580 581 If any of the handlers passed as arguments are subclasses of the 582 default handlers, the default handlers will not be used. 583 """ 584 opener = OpenerDirector() 585 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, 586 HTTPDefaultErrorHandler, HTTPRedirectHandler, 587 FTPHandler, FileHandler, HTTPErrorProcessor, 588 DataHandler] 589 if hasattr(http.client, "HTTPSConnection"): 590 default_classes.append(HTTPSHandler) 591 skip = set() 592 for klass in default_classes: 593 for check in handlers: 594 if isinstance(check, type): 595 if issubclass(check, klass): 596 skip.add(klass) 597 elif isinstance(check, klass): 598 skip.add(klass) 599 for klass in skip: 600 default_classes.remove(klass) 601 602 for klass in default_classes: 603 opener.add_handler(klass()) 604 605 for h in handlers: 606 if isinstance(h, type): 607 h = h() 608 opener.add_handler(h) 609 return opener 610 611class BaseHandler: 612 handler_order = 500 613 614 def add_parent(self, parent): 615 self.parent = parent 616 617 def close(self): 618 # Only exists for backwards compatibility 619 pass 620 621 def __lt__(self, other): 622 if not hasattr(other, "handler_order"): 623 # Try to preserve the old behavior of having custom classes 624 # inserted after default ones (works only for custom user 625 # classes which are not aware of handler_order). 626 return True 627 return self.handler_order < other.handler_order 628 629 630class HTTPErrorProcessor(BaseHandler): 631 """Process HTTP error responses.""" 632 handler_order = 1000 # after all other processing 633 634 def http_response(self, request, response): 635 code, msg, hdrs = response.code, response.msg, response.info() 636 637 # According to RFC 2616, "2xx" code indicates that the client's 638 # request was successfully received, understood, and accepted. 639 if not (200 <= code < 300): 640 response = self.parent.error( 641 'http', request, response, code, msg, hdrs) 642 643 return response 644 645 https_response = http_response 646 647class HTTPDefaultErrorHandler(BaseHandler): 648 def http_error_default(self, req, fp, code, msg, hdrs): 649 raise HTTPError(req.full_url, code, msg, hdrs, fp) 650 651class HTTPRedirectHandler(BaseHandler): 652 # maximum number of redirections to any single URL 653 # this is needed because of the state that cookies introduce 654 max_repeats = 4 655 # maximum total number of redirections (regardless of URL) before 656 # assuming we're in a loop 657 max_redirections = 10 658 659 def redirect_request(self, req, fp, code, msg, headers, newurl): 660 """Return a Request or None in response to a redirect. 661 662 This is called by the http_error_30x methods when a 663 redirection response is received. If a redirection should 664 take place, return a new Request to allow http_error_30x to 665 perform the redirect. Otherwise, raise HTTPError if no-one 666 else should try to handle this url. Return None if you can't 667 but another Handler might. 668 """ 669 m = req.get_method() 670 if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD") 671 or code in (301, 302, 303) and m == "POST")): 672 raise HTTPError(req.full_url, code, msg, headers, fp) 673 674 # Strictly (according to RFC 2616), 301 or 302 in response to 675 # a POST MUST NOT cause a redirection without confirmation 676 # from the user (of urllib.request, in this case). In practice, 677 # essentially all clients do redirect in this case, so we do 678 # the same. 679 680 # Be conciliant with URIs containing a space. This is mainly 681 # redundant with the more complete encoding done in http_error_302(), 682 # but it is kept for compatibility with other callers. 683 newurl = newurl.replace(' ', '%20') 684 685 CONTENT_HEADERS = ("content-length", "content-type") 686 newheaders = {k: v for k, v in req.headers.items() 687 if k.lower() not in CONTENT_HEADERS} 688 return Request(newurl, 689 headers=newheaders, 690 origin_req_host=req.origin_req_host, 691 unverifiable=True) 692 693 # Implementation note: To avoid the server sending us into an 694 # infinite loop, the request object needs to track what URLs we 695 # have already seen. Do this by adding a handler-specific 696 # attribute to the Request object. 697 def http_error_302(self, req, fp, code, msg, headers): 698 # Some servers (incorrectly) return multiple Location headers 699 # (so probably same goes for URI). Use first header. 700 if "location" in headers: 701 newurl = headers["location"] 702 elif "uri" in headers: 703 newurl = headers["uri"] 704 else: 705 return 706 707 # fix a possible malformed URL 708 urlparts = urlparse(newurl) 709 710 # For security reasons we don't allow redirection to anything other 711 # than http, https or ftp. 712 713 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 714 raise HTTPError( 715 newurl, code, 716 "%s - Redirection to url '%s' is not allowed" % (msg, newurl), 717 headers, fp) 718 719 if not urlparts.path and urlparts.netloc: 720 urlparts = list(urlparts) 721 urlparts[2] = "/" 722 newurl = urlunparse(urlparts) 723 724 # http.client.parse_headers() decodes as ISO-8859-1. Recover the 725 # original bytes and percent-encode non-ASCII bytes, and any special 726 # characters such as the space. 727 newurl = quote( 728 newurl, encoding="iso-8859-1", safe=string.punctuation) 729 newurl = urljoin(req.full_url, newurl) 730 731 # XXX Probably want to forget about the state of the current 732 # request, although that might interact poorly with other 733 # handlers that also use handler-specific request attributes 734 new = self.redirect_request(req, fp, code, msg, headers, newurl) 735 if new is None: 736 return 737 738 # loop detection 739 # .redirect_dict has a key url if url was previously visited. 740 if hasattr(req, 'redirect_dict'): 741 visited = new.redirect_dict = req.redirect_dict 742 if (visited.get(newurl, 0) >= self.max_repeats or 743 len(visited) >= self.max_redirections): 744 raise HTTPError(req.full_url, code, 745 self.inf_msg + msg, headers, fp) 746 else: 747 visited = new.redirect_dict = req.redirect_dict = {} 748 visited[newurl] = visited.get(newurl, 0) + 1 749 750 # Don't close the fp until we are sure that we won't use it 751 # with HTTPError. 752 fp.read() 753 fp.close() 754 755 return self.parent.open(new, timeout=req.timeout) 756 757 http_error_301 = http_error_303 = http_error_307 = http_error_302 758 759 inf_msg = "The HTTP server returned a redirect error that would " \ 760 "lead to an infinite loop.\n" \ 761 "The last 30x error message was:\n" 762 763 764def _parse_proxy(proxy): 765 """Return (scheme, user, password, host/port) given a URL or an authority. 766 767 If a URL is supplied, it must have an authority (host:port) component. 768 According to RFC 3986, having an authority component means the URL must 769 have two slashes after the scheme. 770 """ 771 scheme, r_scheme = splittype(proxy) 772 if not r_scheme.startswith("/"): 773 # authority 774 scheme = None 775 authority = proxy 776 else: 777 # URL 778 if not r_scheme.startswith("//"): 779 raise ValueError("proxy URL with no authority: %r" % proxy) 780 # We have an authority, so for RFC 3986-compliant URLs (by ss 3. 781 # and 3.3.), path is empty or starts with '/' 782 end = r_scheme.find("/", 2) 783 if end == -1: 784 end = None 785 authority = r_scheme[2:end] 786 userinfo, hostport = splituser(authority) 787 if userinfo is not None: 788 user, password = splitpasswd(userinfo) 789 else: 790 user = password = None 791 return scheme, user, password, hostport 792 793class ProxyHandler(BaseHandler): 794 # Proxies must be in front 795 handler_order = 100 796 797 def __init__(self, proxies=None): 798 if proxies is None: 799 proxies = getproxies() 800 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 801 self.proxies = proxies 802 for type, url in proxies.items(): 803 setattr(self, '%s_open' % type, 804 lambda r, proxy=url, type=type, meth=self.proxy_open: 805 meth(r, proxy, type)) 806 807 def proxy_open(self, req, proxy, type): 808 orig_type = req.type 809 proxy_type, user, password, hostport = _parse_proxy(proxy) 810 if proxy_type is None: 811 proxy_type = orig_type 812 813 if req.host and proxy_bypass(req.host): 814 return None 815 816 if user and password: 817 user_pass = '%s:%s' % (unquote(user), 818 unquote(password)) 819 creds = base64.b64encode(user_pass.encode()).decode("ascii") 820 req.add_header('Proxy-authorization', 'Basic ' + creds) 821 hostport = unquote(hostport) 822 req.set_proxy(hostport, proxy_type) 823 if orig_type == proxy_type or orig_type == 'https': 824 # let other handlers take care of it 825 return None 826 else: 827 # need to start over, because the other handlers don't 828 # grok the proxy's URL type 829 # e.g. if we have a constructor arg proxies like so: 830 # {'http': 'ftp://proxy.example.com'}, we may end up turning 831 # a request for http://acme.example.com/a into one for 832 # ftp://proxy.example.com/a 833 return self.parent.open(req, timeout=req.timeout) 834 835class HTTPPasswordMgr: 836 837 def __init__(self): 838 self.passwd = {} 839 840 def add_password(self, realm, uri, user, passwd): 841 # uri could be a single URI or a sequence 842 if isinstance(uri, str): 843 uri = [uri] 844 if realm not in self.passwd: 845 self.passwd[realm] = {} 846 for default_port in True, False: 847 reduced_uri = tuple( 848 self.reduce_uri(u, default_port) for u in uri) 849 self.passwd[realm][reduced_uri] = (user, passwd) 850 851 def find_user_password(self, realm, authuri): 852 domains = self.passwd.get(realm, {}) 853 for default_port in True, False: 854 reduced_authuri = self.reduce_uri(authuri, default_port) 855 for uris, authinfo in domains.items(): 856 for uri in uris: 857 if self.is_suburi(uri, reduced_authuri): 858 return authinfo 859 return None, None 860 861 def reduce_uri(self, uri, default_port=True): 862 """Accept authority or URI and extract only the authority and path.""" 863 # note HTTP URLs do not have a userinfo component 864 parts = urlsplit(uri) 865 if parts[1]: 866 # URI 867 scheme = parts[0] 868 authority = parts[1] 869 path = parts[2] or '/' 870 else: 871 # host or host:port 872 scheme = None 873 authority = uri 874 path = '/' 875 host, port = splitport(authority) 876 if default_port and port is None and scheme is not None: 877 dport = {"http": 80, 878 "https": 443, 879 }.get(scheme) 880 if dport is not None: 881 authority = "%s:%d" % (host, dport) 882 return authority, path 883 884 def is_suburi(self, base, test): 885 """Check if test is below base in a URI tree 886 887 Both args must be URIs in reduced form. 888 """ 889 if base == test: 890 return True 891 if base[0] != test[0]: 892 return False 893 common = posixpath.commonprefix((base[1], test[1])) 894 if len(common) == len(base[1]): 895 return True 896 return False 897 898 899class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): 900 901 def find_user_password(self, realm, authuri): 902 user, password = HTTPPasswordMgr.find_user_password(self, realm, 903 authuri) 904 if user is not None: 905 return user, password 906 return HTTPPasswordMgr.find_user_password(self, None, authuri) 907 908 909class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm): 910 911 def __init__(self, *args, **kwargs): 912 self.authenticated = {} 913 super().__init__(*args, **kwargs) 914 915 def add_password(self, realm, uri, user, passwd, is_authenticated=False): 916 self.update_authenticated(uri, is_authenticated) 917 # Add a default for prior auth requests 918 if realm is not None: 919 super().add_password(None, uri, user, passwd) 920 super().add_password(realm, uri, user, passwd) 921 922 def update_authenticated(self, uri, is_authenticated=False): 923 # uri could be a single URI or a sequence 924 if isinstance(uri, str): 925 uri = [uri] 926 927 for default_port in True, False: 928 for u in uri: 929 reduced_uri = self.reduce_uri(u, default_port) 930 self.authenticated[reduced_uri] = is_authenticated 931 932 def is_authenticated(self, authuri): 933 for default_port in True, False: 934 reduced_authuri = self.reduce_uri(authuri, default_port) 935 for uri in self.authenticated: 936 if self.is_suburi(uri, reduced_authuri): 937 return self.authenticated[uri] 938 939 940class AbstractBasicAuthHandler: 941 942 # XXX this allows for multiple auth-schemes, but will stupidly pick 943 # the last one with a realm specified. 944 945 # allow for double- and single-quoted realm values 946 # (single quotes are a violation of the RFC, but appear in the wild) 947 rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' 948 'realm=(["\']?)([^"\']*)\\2', re.I) 949 950 # XXX could pre-emptively send auth info already accepted (RFC 2617, 951 # end of section 2, and section 1.2 immediately after "credentials" 952 # production). 953 954 def __init__(self, password_mgr=None): 955 if password_mgr is None: 956 password_mgr = HTTPPasswordMgr() 957 self.passwd = password_mgr 958 self.add_password = self.passwd.add_password 959 960 def http_error_auth_reqed(self, authreq, host, req, headers): 961 # host may be an authority (without userinfo) or a URL with an 962 # authority 963 # XXX could be multiple headers 964 authreq = headers.get(authreq, None) 965 966 if authreq: 967 scheme = authreq.split()[0] 968 if scheme.lower() != 'basic': 969 raise ValueError("AbstractBasicAuthHandler does not" 970 " support the following scheme: '%s'" % 971 scheme) 972 else: 973 mo = AbstractBasicAuthHandler.rx.search(authreq) 974 if mo: 975 scheme, quote, realm = mo.groups() 976 if quote not in ['"',"'"]: 977 warnings.warn("Basic Auth Realm was unquoted", 978 UserWarning, 2) 979 if scheme.lower() == 'basic': 980 return self.retry_http_basic_auth(host, req, realm) 981 982 def retry_http_basic_auth(self, host, req, realm): 983 user, pw = self.passwd.find_user_password(realm, host) 984 if pw is not None: 985 raw = "%s:%s" % (user, pw) 986 auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii") 987 if req.get_header(self.auth_header, None) == auth: 988 return None 989 req.add_unredirected_header(self.auth_header, auth) 990 return self.parent.open(req, timeout=req.timeout) 991 else: 992 return None 993 994 def http_request(self, req): 995 if (not hasattr(self.passwd, 'is_authenticated') or 996 not self.passwd.is_authenticated(req.full_url)): 997 return req 998 999 if not req.has_header('Authorization'): 1000 user, passwd = self.passwd.find_user_password(None, req.full_url) 1001 credentials = '{0}:{1}'.format(user, passwd).encode() 1002 auth_str = base64.standard_b64encode(credentials).decode() 1003 req.add_unredirected_header('Authorization', 1004 'Basic {}'.format(auth_str.strip())) 1005 return req 1006 1007 def http_response(self, req, response): 1008 if hasattr(self.passwd, 'is_authenticated'): 1009 if 200 <= response.code < 300: 1010 self.passwd.update_authenticated(req.full_url, True) 1011 else: 1012 self.passwd.update_authenticated(req.full_url, False) 1013 return response 1014 1015 https_request = http_request 1016 https_response = http_response 1017 1018 1019 1020class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 1021 1022 auth_header = 'Authorization' 1023 1024 def http_error_401(self, req, fp, code, msg, headers): 1025 url = req.full_url 1026 response = self.http_error_auth_reqed('www-authenticate', 1027 url, req, headers) 1028 return response 1029 1030 1031class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 1032 1033 auth_header = 'Proxy-authorization' 1034 1035 def http_error_407(self, req, fp, code, msg, headers): 1036 # http_error_auth_reqed requires that there is no userinfo component in 1037 # authority. Assume there isn't one, since urllib.request does not (and 1038 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing 1039 # userinfo. 1040 authority = req.host 1041 response = self.http_error_auth_reqed('proxy-authenticate', 1042 authority, req, headers) 1043 return response 1044 1045 1046# Return n random bytes. 1047_randombytes = os.urandom 1048 1049 1050class AbstractDigestAuthHandler: 1051 # Digest authentication is specified in RFC 2617. 1052 1053 # XXX The client does not inspect the Authentication-Info header 1054 # in a successful response. 1055 1056 # XXX It should be possible to test this implementation against 1057 # a mock server that just generates a static set of challenges. 1058 1059 # XXX qop="auth-int" supports is shaky 1060 1061 def __init__(self, passwd=None): 1062 if passwd is None: 1063 passwd = HTTPPasswordMgr() 1064 self.passwd = passwd 1065 self.add_password = self.passwd.add_password 1066 self.retried = 0 1067 self.nonce_count = 0 1068 self.last_nonce = None 1069 1070 def reset_retry_count(self): 1071 self.retried = 0 1072 1073 def http_error_auth_reqed(self, auth_header, host, req, headers): 1074 authreq = headers.get(auth_header, None) 1075 if self.retried > 5: 1076 # Don't fail endlessly - if we failed once, we'll probably 1077 # fail a second time. Hm. Unless the Password Manager is 1078 # prompting for the information. Crap. This isn't great 1079 # but it's better than the current 'repeat until recursion 1080 # depth exceeded' approach <wink> 1081 raise HTTPError(req.full_url, 401, "digest auth failed", 1082 headers, None) 1083 else: 1084 self.retried += 1 1085 if authreq: 1086 scheme = authreq.split()[0] 1087 if scheme.lower() == 'digest': 1088 return self.retry_http_digest_auth(req, authreq) 1089 elif scheme.lower() != 'basic': 1090 raise ValueError("AbstractDigestAuthHandler does not support" 1091 " the following scheme: '%s'" % scheme) 1092 1093 def retry_http_digest_auth(self, req, auth): 1094 token, challenge = auth.split(' ', 1) 1095 chal = parse_keqv_list(filter(None, parse_http_list(challenge))) 1096 auth = self.get_authorization(req, chal) 1097 if auth: 1098 auth_val = 'Digest %s' % auth 1099 if req.headers.get(self.auth_header, None) == auth_val: 1100 return None 1101 req.add_unredirected_header(self.auth_header, auth_val) 1102 resp = self.parent.open(req, timeout=req.timeout) 1103 return resp 1104 1105 def get_cnonce(self, nonce): 1106 # The cnonce-value is an opaque 1107 # quoted string value provided by the client and used by both client 1108 # and server to avoid chosen plaintext attacks, to provide mutual 1109 # authentication, and to provide some message integrity protection. 1110 # This isn't a fabulous effort, but it's probably Good Enough. 1111 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) 1112 b = s.encode("ascii") + _randombytes(8) 1113 dig = hashlib.sha1(b).hexdigest() 1114 return dig[:16] 1115 1116 def get_authorization(self, req, chal): 1117 try: 1118 realm = chal['realm'] 1119 nonce = chal['nonce'] 1120 qop = chal.get('qop') 1121 algorithm = chal.get('algorithm', 'MD5') 1122 # mod_digest doesn't send an opaque, even though it isn't 1123 # supposed to be optional 1124 opaque = chal.get('opaque', None) 1125 except KeyError: 1126 return None 1127 1128 H, KD = self.get_algorithm_impls(algorithm) 1129 if H is None: 1130 return None 1131 1132 user, pw = self.passwd.find_user_password(realm, req.full_url) 1133 if user is None: 1134 return None 1135 1136 # XXX not implemented yet 1137 if req.data is not None: 1138 entdig = self.get_entity_digest(req.data, chal) 1139 else: 1140 entdig = None 1141 1142 A1 = "%s:%s:%s" % (user, realm, pw) 1143 A2 = "%s:%s" % (req.get_method(), 1144 # XXX selector: what about proxies and full urls 1145 req.selector) 1146 if qop == 'auth': 1147 if nonce == self.last_nonce: 1148 self.nonce_count += 1 1149 else: 1150 self.nonce_count = 1 1151 self.last_nonce = nonce 1152 ncvalue = '%08x' % self.nonce_count 1153 cnonce = self.get_cnonce(nonce) 1154 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) 1155 respdig = KD(H(A1), noncebit) 1156 elif qop is None: 1157 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 1158 else: 1159 # XXX handle auth-int. 1160 raise URLError("qop '%s' is not supported." % qop) 1161 1162 # XXX should the partial digests be encoded too? 1163 1164 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 1165 'response="%s"' % (user, realm, nonce, req.selector, 1166 respdig) 1167 if opaque: 1168 base += ', opaque="%s"' % opaque 1169 if entdig: 1170 base += ', digest="%s"' % entdig 1171 base += ', algorithm="%s"' % algorithm 1172 if qop: 1173 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 1174 return base 1175 1176 def get_algorithm_impls(self, algorithm): 1177 # lambdas assume digest modules are imported at the top level 1178 if algorithm == 'MD5': 1179 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() 1180 elif algorithm == 'SHA': 1181 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() 1182 # XXX MD5-sess 1183 else: 1184 raise ValueError("Unsupported digest authentication " 1185 "algorithm %r" % algorithm) 1186 KD = lambda s, d: H("%s:%s" % (s, d)) 1187 return H, KD 1188 1189 def get_entity_digest(self, data, chal): 1190 # XXX not implemented yet 1191 return None 1192 1193 1194class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1195 """An authentication protocol defined by RFC 2069 1196 1197 Digest authentication improves on basic authentication because it 1198 does not transmit passwords in the clear. 1199 """ 1200 1201 auth_header = 'Authorization' 1202 handler_order = 490 # before Basic auth 1203 1204 def http_error_401(self, req, fp, code, msg, headers): 1205 host = urlparse(req.full_url)[1] 1206 retry = self.http_error_auth_reqed('www-authenticate', 1207 host, req, headers) 1208 self.reset_retry_count() 1209 return retry 1210 1211 1212class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1213 1214 auth_header = 'Proxy-Authorization' 1215 handler_order = 490 # before Basic auth 1216 1217 def http_error_407(self, req, fp, code, msg, headers): 1218 host = req.host 1219 retry = self.http_error_auth_reqed('proxy-authenticate', 1220 host, req, headers) 1221 self.reset_retry_count() 1222 return retry 1223 1224class AbstractHTTPHandler(BaseHandler): 1225 1226 def __init__(self, debuglevel=0): 1227 self._debuglevel = debuglevel 1228 1229 def set_http_debuglevel(self, level): 1230 self._debuglevel = level 1231 1232 def _get_content_length(self, request): 1233 return http.client.HTTPConnection._get_content_length( 1234 request.data, 1235 request.get_method()) 1236 1237 def do_request_(self, request): 1238 host = request.host 1239 if not host: 1240 raise URLError('no host given') 1241 1242 if request.data is not None: # POST 1243 data = request.data 1244 if isinstance(data, str): 1245 msg = "POST data should be bytes, an iterable of bytes, " \ 1246 "or a file object. It cannot be of type str." 1247 raise TypeError(msg) 1248 if not request.has_header('Content-type'): 1249 request.add_unredirected_header( 1250 'Content-type', 1251 'application/x-www-form-urlencoded') 1252 if (not request.has_header('Content-length') 1253 and not request.has_header('Transfer-encoding')): 1254 content_length = self._get_content_length(request) 1255 if content_length is not None: 1256 request.add_unredirected_header( 1257 'Content-length', str(content_length)) 1258 else: 1259 request.add_unredirected_header( 1260 'Transfer-encoding', 'chunked') 1261 1262 sel_host = host 1263 if request.has_proxy(): 1264 scheme, sel = splittype(request.selector) 1265 sel_host, sel_path = splithost(sel) 1266 if not request.has_header('Host'): 1267 request.add_unredirected_header('Host', sel_host) 1268 for name, value in self.parent.addheaders: 1269 name = name.capitalize() 1270 if not request.has_header(name): 1271 request.add_unredirected_header(name, value) 1272 1273 return request 1274 1275 def do_open(self, http_class, req, **http_conn_args): 1276 """Return an HTTPResponse object for the request, using http_class. 1277 1278 http_class must implement the HTTPConnection API from http.client. 1279 """ 1280 host = req.host 1281 if not host: 1282 raise URLError('no host given') 1283 1284 # will parse host:port 1285 h = http_class(host, timeout=req.timeout, **http_conn_args) 1286 h.set_debuglevel(self._debuglevel) 1287 1288 headers = dict(req.unredirected_hdrs) 1289 headers.update({k: v for k, v in req.headers.items() 1290 if k not in headers}) 1291 1292 # TODO(jhylton): Should this be redesigned to handle 1293 # persistent connections? 1294 1295 # We want to make an HTTP/1.1 request, but the addinfourl 1296 # class isn't prepared to deal with a persistent connection. 1297 # It will try to read all remaining data from the socket, 1298 # which will block while the server waits for the next request. 1299 # So make sure the connection gets closed after the (only) 1300 # request. 1301 headers["Connection"] = "close" 1302 headers = {name.title(): val for name, val in headers.items()} 1303 1304 if req._tunnel_host: 1305 tunnel_headers = {} 1306 proxy_auth_hdr = "Proxy-Authorization" 1307 if proxy_auth_hdr in headers: 1308 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] 1309 # Proxy-Authorization should not be sent to origin 1310 # server. 1311 del headers[proxy_auth_hdr] 1312 h.set_tunnel(req._tunnel_host, headers=tunnel_headers) 1313 1314 try: 1315 try: 1316 h.request(req.get_method(), req.selector, req.data, headers, 1317 encode_chunked=req.has_header('Transfer-encoding')) 1318 except OSError as err: # timeout error 1319 raise URLError(err) 1320 r = h.getresponse() 1321 except: 1322 h.close() 1323 raise 1324 1325 # If the server does not send us a 'Connection: close' header, 1326 # HTTPConnection assumes the socket should be left open. Manually 1327 # mark the socket to be closed when this response object goes away. 1328 if h.sock: 1329 h.sock.close() 1330 h.sock = None 1331 1332 r.url = req.get_full_url() 1333 # This line replaces the .msg attribute of the HTTPResponse 1334 # with .headers, because urllib clients expect the response to 1335 # have the reason in .msg. It would be good to mark this 1336 # attribute is deprecated and get then to use info() or 1337 # .headers. 1338 r.msg = r.reason 1339 return r 1340 1341 1342class HTTPHandler(AbstractHTTPHandler): 1343 1344 def http_open(self, req): 1345 return self.do_open(http.client.HTTPConnection, req) 1346 1347 http_request = AbstractHTTPHandler.do_request_ 1348 1349if hasattr(http.client, 'HTTPSConnection'): 1350 1351 class HTTPSHandler(AbstractHTTPHandler): 1352 1353 def __init__(self, debuglevel=0, context=None, check_hostname=None): 1354 AbstractHTTPHandler.__init__(self, debuglevel) 1355 self._context = context 1356 self._check_hostname = check_hostname 1357 1358 def https_open(self, req): 1359 return self.do_open(http.client.HTTPSConnection, req, 1360 context=self._context, check_hostname=self._check_hostname) 1361 1362 https_request = AbstractHTTPHandler.do_request_ 1363 1364 __all__.append('HTTPSHandler') 1365 1366class HTTPCookieProcessor(BaseHandler): 1367 def __init__(self, cookiejar=None): 1368 import http.cookiejar 1369 if cookiejar is None: 1370 cookiejar = http.cookiejar.CookieJar() 1371 self.cookiejar = cookiejar 1372 1373 def http_request(self, request): 1374 self.cookiejar.add_cookie_header(request) 1375 return request 1376 1377 def http_response(self, request, response): 1378 self.cookiejar.extract_cookies(response, request) 1379 return response 1380 1381 https_request = http_request 1382 https_response = http_response 1383 1384class UnknownHandler(BaseHandler): 1385 def unknown_open(self, req): 1386 type = req.type 1387 raise URLError('unknown url type: %s' % type) 1388 1389def parse_keqv_list(l): 1390 """Parse list of key=value strings where keys are not duplicated.""" 1391 parsed = {} 1392 for elt in l: 1393 k, v = elt.split('=', 1) 1394 if v[0] == '"' and v[-1] == '"': 1395 v = v[1:-1] 1396 parsed[k] = v 1397 return parsed 1398 1399def parse_http_list(s): 1400 """Parse lists as described by RFC 2068 Section 2. 1401 1402 In particular, parse comma-separated lists where the elements of 1403 the list may include quoted-strings. A quoted-string could 1404 contain a comma. A non-quoted string could have quotes in the 1405 middle. Neither commas nor quotes count if they are escaped. 1406 Only double-quotes count, not single-quotes. 1407 """ 1408 res = [] 1409 part = '' 1410 1411 escape = quote = False 1412 for cur in s: 1413 if escape: 1414 part += cur 1415 escape = False 1416 continue 1417 if quote: 1418 if cur == '\\': 1419 escape = True 1420 continue 1421 elif cur == '"': 1422 quote = False 1423 part += cur 1424 continue 1425 1426 if cur == ',': 1427 res.append(part) 1428 part = '' 1429 continue 1430 1431 if cur == '"': 1432 quote = True 1433 1434 part += cur 1435 1436 # append last part 1437 if part: 1438 res.append(part) 1439 1440 return [part.strip() for part in res] 1441 1442class FileHandler(BaseHandler): 1443 # Use local file or FTP depending on form of URL 1444 def file_open(self, req): 1445 url = req.selector 1446 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1447 req.host != 'localhost'): 1448 if not req.host in self.get_names(): 1449 raise URLError("file:// scheme is supported only on localhost") 1450 else: 1451 return self.open_local_file(req) 1452 1453 # names for the localhost 1454 names = None 1455 def get_names(self): 1456 if FileHandler.names is None: 1457 try: 1458 FileHandler.names = tuple( 1459 socket.gethostbyname_ex('localhost')[2] + 1460 socket.gethostbyname_ex(socket.gethostname())[2]) 1461 except socket.gaierror: 1462 FileHandler.names = (socket.gethostbyname('localhost'),) 1463 return FileHandler.names 1464 1465 # not entirely sure what the rules are here 1466 def open_local_file(self, req): 1467 import email.utils 1468 import mimetypes 1469 host = req.host 1470 filename = req.selector 1471 localfile = url2pathname(filename) 1472 try: 1473 stats = os.stat(localfile) 1474 size = stats.st_size 1475 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1476 mtype = mimetypes.guess_type(filename)[0] 1477 headers = email.message_from_string( 1478 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1479 (mtype or 'text/plain', size, modified)) 1480 if host: 1481 host, port = splitport(host) 1482 if not host or \ 1483 (not port and _safe_gethostbyname(host) in self.get_names()): 1484 if host: 1485 origurl = 'file://' + host + filename 1486 else: 1487 origurl = 'file://' + filename 1488 return addinfourl(open(localfile, 'rb'), headers, origurl) 1489 except OSError as exp: 1490 raise URLError(exp) 1491 raise URLError('file not on local host') 1492 1493def _safe_gethostbyname(host): 1494 try: 1495 return socket.gethostbyname(host) 1496 except socket.gaierror: 1497 return None 1498 1499class FTPHandler(BaseHandler): 1500 def ftp_open(self, req): 1501 import ftplib 1502 import mimetypes 1503 host = req.host 1504 if not host: 1505 raise URLError('ftp error: no host given') 1506 host, port = splitport(host) 1507 if port is None: 1508 port = ftplib.FTP_PORT 1509 else: 1510 port = int(port) 1511 1512 # username/password handling 1513 user, host = splituser(host) 1514 if user: 1515 user, passwd = splitpasswd(user) 1516 else: 1517 passwd = None 1518 host = unquote(host) 1519 user = user or '' 1520 passwd = passwd or '' 1521 1522 try: 1523 host = socket.gethostbyname(host) 1524 except OSError as msg: 1525 raise URLError(msg) 1526 path, attrs = splitattr(req.selector) 1527 dirs = path.split('/') 1528 dirs = list(map(unquote, dirs)) 1529 dirs, file = dirs[:-1], dirs[-1] 1530 if dirs and not dirs[0]: 1531 dirs = dirs[1:] 1532 try: 1533 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) 1534 type = file and 'I' or 'D' 1535 for attr in attrs: 1536 attr, value = splitvalue(attr) 1537 if attr.lower() == 'type' and \ 1538 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1539 type = value.upper() 1540 fp, retrlen = fw.retrfile(file, type) 1541 headers = "" 1542 mtype = mimetypes.guess_type(req.full_url)[0] 1543 if mtype: 1544 headers += "Content-type: %s\n" % mtype 1545 if retrlen is not None and retrlen >= 0: 1546 headers += "Content-length: %d\n" % retrlen 1547 headers = email.message_from_string(headers) 1548 return addinfourl(fp, headers, req.full_url) 1549 except ftplib.all_errors as exp: 1550 exc = URLError('ftp error: %r' % exp) 1551 raise exc.with_traceback(sys.exc_info()[2]) 1552 1553 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1554 return ftpwrapper(user, passwd, host, port, dirs, timeout, 1555 persistent=False) 1556 1557class CacheFTPHandler(FTPHandler): 1558 # XXX would be nice to have pluggable cache strategies 1559 # XXX this stuff is definitely not thread safe 1560 def __init__(self): 1561 self.cache = {} 1562 self.timeout = {} 1563 self.soonest = 0 1564 self.delay = 60 1565 self.max_conns = 16 1566 1567 def setTimeout(self, t): 1568 self.delay = t 1569 1570 def setMaxConns(self, m): 1571 self.max_conns = m 1572 1573 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1574 key = user, host, port, '/'.join(dirs), timeout 1575 if key in self.cache: 1576 self.timeout[key] = time.time() + self.delay 1577 else: 1578 self.cache[key] = ftpwrapper(user, passwd, host, port, 1579 dirs, timeout) 1580 self.timeout[key] = time.time() + self.delay 1581 self.check_cache() 1582 return self.cache[key] 1583 1584 def check_cache(self): 1585 # first check for old ones 1586 t = time.time() 1587 if self.soonest <= t: 1588 for k, v in list(self.timeout.items()): 1589 if v < t: 1590 self.cache[k].close() 1591 del self.cache[k] 1592 del self.timeout[k] 1593 self.soonest = min(list(self.timeout.values())) 1594 1595 # then check the size 1596 if len(self.cache) == self.max_conns: 1597 for k, v in list(self.timeout.items()): 1598 if v == self.soonest: 1599 del self.cache[k] 1600 del self.timeout[k] 1601 break 1602 self.soonest = min(list(self.timeout.values())) 1603 1604 def clear_cache(self): 1605 for conn in self.cache.values(): 1606 conn.close() 1607 self.cache.clear() 1608 self.timeout.clear() 1609 1610class DataHandler(BaseHandler): 1611 def data_open(self, req): 1612 # data URLs as specified in RFC 2397. 1613 # 1614 # ignores POSTed data 1615 # 1616 # syntax: 1617 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 1618 # mediatype := [ type "/" subtype ] *( ";" parameter ) 1619 # data := *urlchar 1620 # parameter := attribute "=" value 1621 url = req.full_url 1622 1623 scheme, data = url.split(":",1) 1624 mediatype, data = data.split(",",1) 1625 1626 # even base64 encoded data URLs might be quoted so unquote in any case: 1627 data = unquote_to_bytes(data) 1628 if mediatype.endswith(";base64"): 1629 data = base64.decodebytes(data) 1630 mediatype = mediatype[:-7] 1631 1632 if not mediatype: 1633 mediatype = "text/plain;charset=US-ASCII" 1634 1635 headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" % 1636 (mediatype, len(data))) 1637 1638 return addinfourl(io.BytesIO(data), headers, url) 1639 1640 1641# Code move from the old urllib module 1642 1643MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 1644 1645# Helper for non-unix systems 1646if os.name == 'nt': 1647 from nturl2path import url2pathname, pathname2url 1648else: 1649 def url2pathname(pathname): 1650 """OS-specific conversion from a relative URL of the 'file' scheme 1651 to a file system path; not recommended for general use.""" 1652 return unquote(pathname) 1653 1654 def pathname2url(pathname): 1655 """OS-specific conversion from a file system path to a relative URL 1656 of the 'file' scheme; not recommended for general use.""" 1657 return quote(pathname) 1658 1659 1660ftpcache = {} 1661 1662 1663class URLopener: 1664 """Class to open URLs. 1665 This is a class rather than just a subroutine because we may need 1666 more than one set of global protocol-specific options. 1667 Note -- this is a base class for those who don't want the 1668 automatic handling of errors type 302 (relocated) and 401 1669 (authorization needed).""" 1670 1671 __tempfiles = None 1672 1673 version = "Python-urllib/%s" % __version__ 1674 1675 # Constructor 1676 def __init__(self, proxies=None, **x509): 1677 msg = "%(class)s style of invoking requests is deprecated. " \ 1678 "Use newer urlopen functions/methods" % {'class': self.__class__.__name__} 1679 warnings.warn(msg, DeprecationWarning, stacklevel=3) 1680 if proxies is None: 1681 proxies = getproxies() 1682 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 1683 self.proxies = proxies 1684 self.key_file = x509.get('key_file') 1685 self.cert_file = x509.get('cert_file') 1686 self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')] 1687 self.__tempfiles = [] 1688 self.__unlink = os.unlink # See cleanup() 1689 self.tempcache = None 1690 # Undocumented feature: if you assign {} to tempcache, 1691 # it is used to cache files retrieved with 1692 # self.retrieve(). This is not enabled by default 1693 # since it does not work for changing documents (and I 1694 # haven't got the logic to check expiration headers 1695 # yet). 1696 self.ftpcache = ftpcache 1697 # Undocumented feature: you can use a different 1698 # ftp cache by assigning to the .ftpcache member; 1699 # in case you want logically independent URL openers 1700 # XXX This is not threadsafe. Bah. 1701 1702 def __del__(self): 1703 self.close() 1704 1705 def close(self): 1706 self.cleanup() 1707 1708 def cleanup(self): 1709 # This code sometimes runs when the rest of this module 1710 # has already been deleted, so it can't use any globals 1711 # or import anything. 1712 if self.__tempfiles: 1713 for file in self.__tempfiles: 1714 try: 1715 self.__unlink(file) 1716 except OSError: 1717 pass 1718 del self.__tempfiles[:] 1719 if self.tempcache: 1720 self.tempcache.clear() 1721 1722 def addheader(self, *args): 1723 """Add a header to be used by the HTTP interface only 1724 e.g. u.addheader('Accept', 'sound/basic')""" 1725 self.addheaders.append(args) 1726 1727 # External interface 1728 def open(self, fullurl, data=None): 1729 """Use URLopener().open(file) instead of open(file, 'r').""" 1730 fullurl = unwrap(to_bytes(fullurl)) 1731 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 1732 if self.tempcache and fullurl in self.tempcache: 1733 filename, headers = self.tempcache[fullurl] 1734 fp = open(filename, 'rb') 1735 return addinfourl(fp, headers, fullurl) 1736 urltype, url = splittype(fullurl) 1737 if not urltype: 1738 urltype = 'file' 1739 if urltype in self.proxies: 1740 proxy = self.proxies[urltype] 1741 urltype, proxyhost = splittype(proxy) 1742 host, selector = splithost(proxyhost) 1743 url = (host, fullurl) # Signal special case to open_*() 1744 else: 1745 proxy = None 1746 name = 'open_' + urltype 1747 self.type = urltype 1748 name = name.replace('-', '_') 1749 if not hasattr(self, name): 1750 if proxy: 1751 return self.open_unknown_proxy(proxy, fullurl, data) 1752 else: 1753 return self.open_unknown(fullurl, data) 1754 try: 1755 if data is None: 1756 return getattr(self, name)(url) 1757 else: 1758 return getattr(self, name)(url, data) 1759 except (HTTPError, URLError): 1760 raise 1761 except OSError as msg: 1762 raise OSError('socket error', msg).with_traceback(sys.exc_info()[2]) 1763 1764 def open_unknown(self, fullurl, data=None): 1765 """Overridable interface to open unknown URL type.""" 1766 type, url = splittype(fullurl) 1767 raise OSError('url error', 'unknown url type', type) 1768 1769 def open_unknown_proxy(self, proxy, fullurl, data=None): 1770 """Overridable interface to open unknown URL type.""" 1771 type, url = splittype(fullurl) 1772 raise OSError('url error', 'invalid proxy for %s' % type, proxy) 1773 1774 # External interface 1775 def retrieve(self, url, filename=None, reporthook=None, data=None): 1776 """retrieve(url) returns (filename, headers) for a local object 1777 or (tempfilename, headers) for a remote object.""" 1778 url = unwrap(to_bytes(url)) 1779 if self.tempcache and url in self.tempcache: 1780 return self.tempcache[url] 1781 type, url1 = splittype(url) 1782 if filename is None and (not type or type == 'file'): 1783 try: 1784 fp = self.open_local_file(url1) 1785 hdrs = fp.info() 1786 fp.close() 1787 return url2pathname(splithost(url1)[1]), hdrs 1788 except OSError as msg: 1789 pass 1790 fp = self.open(url, data) 1791 try: 1792 headers = fp.info() 1793 if filename: 1794 tfp = open(filename, 'wb') 1795 else: 1796 garbage, path = splittype(url) 1797 garbage, path = splithost(path or "") 1798 path, garbage = splitquery(path or "") 1799 path, garbage = splitattr(path or "") 1800 suffix = os.path.splitext(path)[1] 1801 (fd, filename) = tempfile.mkstemp(suffix) 1802 self.__tempfiles.append(filename) 1803 tfp = os.fdopen(fd, 'wb') 1804 try: 1805 result = filename, headers 1806 if self.tempcache is not None: 1807 self.tempcache[url] = result 1808 bs = 1024*8 1809 size = -1 1810 read = 0 1811 blocknum = 0 1812 if "content-length" in headers: 1813 size = int(headers["Content-Length"]) 1814 if reporthook: 1815 reporthook(blocknum, bs, size) 1816 while 1: 1817 block = fp.read(bs) 1818 if not block: 1819 break 1820 read += len(block) 1821 tfp.write(block) 1822 blocknum += 1 1823 if reporthook: 1824 reporthook(blocknum, bs, size) 1825 finally: 1826 tfp.close() 1827 finally: 1828 fp.close() 1829 1830 # raise exception if actual size does not match content-length header 1831 if size >= 0 and read < size: 1832 raise ContentTooShortError( 1833 "retrieval incomplete: got only %i out of %i bytes" 1834 % (read, size), result) 1835 1836 return result 1837 1838 # Each method named open_<type> knows how to open that type of URL 1839 1840 def _open_generic_http(self, connection_factory, url, data): 1841 """Make an HTTP connection using connection_class. 1842 1843 This is an internal method that should be called from 1844 open_http() or open_https(). 1845 1846 Arguments: 1847 - connection_factory should take a host name and return an 1848 HTTPConnection instance. 1849 - url is the url to retrieval or a host, relative-path pair. 1850 - data is payload for a POST request or None. 1851 """ 1852 1853 user_passwd = None 1854 proxy_passwd= None 1855 if isinstance(url, str): 1856 host, selector = splithost(url) 1857 if host: 1858 user_passwd, host = splituser(host) 1859 host = unquote(host) 1860 realhost = host 1861 else: 1862 host, selector = url 1863 # check whether the proxy contains authorization information 1864 proxy_passwd, host = splituser(host) 1865 # now we proceed with the url we want to obtain 1866 urltype, rest = splittype(selector) 1867 url = rest 1868 user_passwd = None 1869 if urltype.lower() != 'http': 1870 realhost = None 1871 else: 1872 realhost, rest = splithost(rest) 1873 if realhost: 1874 user_passwd, realhost = splituser(realhost) 1875 if user_passwd: 1876 selector = "%s://%s%s" % (urltype, realhost, rest) 1877 if proxy_bypass(realhost): 1878 host = realhost 1879 1880 if not host: raise OSError('http error', 'no host given') 1881 1882 if proxy_passwd: 1883 proxy_passwd = unquote(proxy_passwd) 1884 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii') 1885 else: 1886 proxy_auth = None 1887 1888 if user_passwd: 1889 user_passwd = unquote(user_passwd) 1890 auth = base64.b64encode(user_passwd.encode()).decode('ascii') 1891 else: 1892 auth = None 1893 http_conn = connection_factory(host) 1894 headers = {} 1895 if proxy_auth: 1896 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth 1897 if auth: 1898 headers["Authorization"] = "Basic %s" % auth 1899 if realhost: 1900 headers["Host"] = realhost 1901 1902 # Add Connection:close as we don't support persistent connections yet. 1903 # This helps in closing the socket and avoiding ResourceWarning 1904 1905 headers["Connection"] = "close" 1906 1907 for header, value in self.addheaders: 1908 headers[header] = value 1909 1910 if data is not None: 1911 headers["Content-Type"] = "application/x-www-form-urlencoded" 1912 http_conn.request("POST", selector, data, headers) 1913 else: 1914 http_conn.request("GET", selector, headers=headers) 1915 1916 try: 1917 response = http_conn.getresponse() 1918 except http.client.BadStatusLine: 1919 # something went wrong with the HTTP status line 1920 raise URLError("http protocol error: bad status line") 1921 1922 # According to RFC 2616, "2xx" code indicates that the client's 1923 # request was successfully received, understood, and accepted. 1924 if 200 <= response.status < 300: 1925 return addinfourl(response, response.msg, "http:" + url, 1926 response.status) 1927 else: 1928 return self.http_error( 1929 url, response.fp, 1930 response.status, response.reason, response.msg, data) 1931 1932 def open_http(self, url, data=None): 1933 """Use HTTP protocol.""" 1934 return self._open_generic_http(http.client.HTTPConnection, url, data) 1935 1936 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 1937 """Handle http errors. 1938 1939 Derived class can override this, or provide specific handlers 1940 named http_error_DDD where DDD is the 3-digit error code.""" 1941 # First check if there's a specific handler for this error 1942 name = 'http_error_%d' % errcode 1943 if hasattr(self, name): 1944 method = getattr(self, name) 1945 if data is None: 1946 result = method(url, fp, errcode, errmsg, headers) 1947 else: 1948 result = method(url, fp, errcode, errmsg, headers, data) 1949 if result: return result 1950 return self.http_error_default(url, fp, errcode, errmsg, headers) 1951 1952 def http_error_default(self, url, fp, errcode, errmsg, headers): 1953 """Default error handler: close the connection and raise OSError.""" 1954 fp.close() 1955 raise HTTPError(url, errcode, errmsg, headers, None) 1956 1957 if _have_ssl: 1958 def _https_connection(self, host): 1959 return http.client.HTTPSConnection(host, 1960 key_file=self.key_file, 1961 cert_file=self.cert_file) 1962 1963 def open_https(self, url, data=None): 1964 """Use HTTPS protocol.""" 1965 return self._open_generic_http(self._https_connection, url, data) 1966 1967 def open_file(self, url): 1968 """Use local file or FTP depending on form of URL.""" 1969 if not isinstance(url, str): 1970 raise URLError('file error: proxy support for file protocol currently not implemented') 1971 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': 1972 raise ValueError("file:// scheme is supported only on localhost") 1973 else: 1974 return self.open_local_file(url) 1975 1976 def open_local_file(self, url): 1977 """Use local file.""" 1978 import email.utils 1979 import mimetypes 1980 host, file = splithost(url) 1981 localname = url2pathname(file) 1982 try: 1983 stats = os.stat(localname) 1984 except OSError as e: 1985 raise URLError(e.strerror, e.filename) 1986 size = stats.st_size 1987 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1988 mtype = mimetypes.guess_type(url)[0] 1989 headers = email.message_from_string( 1990 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 1991 (mtype or 'text/plain', size, modified)) 1992 if not host: 1993 urlfile = file 1994 if file[:1] == '/': 1995 urlfile = 'file://' + file 1996 return addinfourl(open(localname, 'rb'), headers, urlfile) 1997 host, port = splitport(host) 1998 if (not port 1999 and socket.gethostbyname(host) in ((localhost(),) + thishost())): 2000 urlfile = file 2001 if file[:1] == '/': 2002 urlfile = 'file://' + file 2003 elif file[:2] == './': 2004 raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url) 2005 return addinfourl(open(localname, 'rb'), headers, urlfile) 2006 raise URLError('local file error: not on local host') 2007 2008 def open_ftp(self, url): 2009 """Use FTP protocol.""" 2010 if not isinstance(url, str): 2011 raise URLError('ftp error: proxy support for ftp protocol currently not implemented') 2012 import mimetypes 2013 host, path = splithost(url) 2014 if not host: raise URLError('ftp error: no host given') 2015 host, port = splitport(host) 2016 user, host = splituser(host) 2017 if user: user, passwd = splitpasswd(user) 2018 else: passwd = None 2019 host = unquote(host) 2020 user = unquote(user or '') 2021 passwd = unquote(passwd or '') 2022 host = socket.gethostbyname(host) 2023 if not port: 2024 import ftplib 2025 port = ftplib.FTP_PORT 2026 else: 2027 port = int(port) 2028 path, attrs = splitattr(path) 2029 path = unquote(path) 2030 dirs = path.split('/') 2031 dirs, file = dirs[:-1], dirs[-1] 2032 if dirs and not dirs[0]: dirs = dirs[1:] 2033 if dirs and not dirs[0]: dirs[0] = '/' 2034 key = user, host, port, '/'.join(dirs) 2035 # XXX thread unsafe! 2036 if len(self.ftpcache) > MAXFTPCACHE: 2037 # Prune the cache, rather arbitrarily 2038 for k in list(self.ftpcache): 2039 if k != key: 2040 v = self.ftpcache[k] 2041 del self.ftpcache[k] 2042 v.close() 2043 try: 2044 if key not in self.ftpcache: 2045 self.ftpcache[key] = \ 2046 ftpwrapper(user, passwd, host, port, dirs) 2047 if not file: type = 'D' 2048 else: type = 'I' 2049 for attr in attrs: 2050 attr, value = splitvalue(attr) 2051 if attr.lower() == 'type' and \ 2052 value in ('a', 'A', 'i', 'I', 'd', 'D'): 2053 type = value.upper() 2054 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 2055 mtype = mimetypes.guess_type("ftp:" + url)[0] 2056 headers = "" 2057 if mtype: 2058 headers += "Content-Type: %s\n" % mtype 2059 if retrlen is not None and retrlen >= 0: 2060 headers += "Content-Length: %d\n" % retrlen 2061 headers = email.message_from_string(headers) 2062 return addinfourl(fp, headers, "ftp:" + url) 2063 except ftperrors() as exp: 2064 raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2]) 2065 2066 def open_data(self, url, data=None): 2067 """Use "data" URL.""" 2068 if not isinstance(url, str): 2069 raise URLError('data error: proxy support for data protocol currently not implemented') 2070 # ignore POSTed data 2071 # 2072 # syntax of data URLs: 2073 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 2074 # mediatype := [ type "/" subtype ] *( ";" parameter ) 2075 # data := *urlchar 2076 # parameter := attribute "=" value 2077 try: 2078 [type, data] = url.split(',', 1) 2079 except ValueError: 2080 raise OSError('data error', 'bad data URL') 2081 if not type: 2082 type = 'text/plain;charset=US-ASCII' 2083 semi = type.rfind(';') 2084 if semi >= 0 and '=' not in type[semi:]: 2085 encoding = type[semi+1:] 2086 type = type[:semi] 2087 else: 2088 encoding = '' 2089 msg = [] 2090 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', 2091 time.gmtime(time.time()))) 2092 msg.append('Content-type: %s' % type) 2093 if encoding == 'base64': 2094 # XXX is this encoding/decoding ok? 2095 data = base64.decodebytes(data.encode('ascii')).decode('latin-1') 2096 else: 2097 data = unquote(data) 2098 msg.append('Content-Length: %d' % len(data)) 2099 msg.append('') 2100 msg.append(data) 2101 msg = '\n'.join(msg) 2102 headers = email.message_from_string(msg) 2103 f = io.StringIO(msg) 2104 #f.fileno = None # needed for addinfourl 2105 return addinfourl(f, headers, url) 2106 2107 2108class FancyURLopener(URLopener): 2109 """Derived class with handlers for errors we can handle (perhaps).""" 2110 2111 def __init__(self, *args, **kwargs): 2112 URLopener.__init__(self, *args, **kwargs) 2113 self.auth_cache = {} 2114 self.tries = 0 2115 self.maxtries = 10 2116 2117 def http_error_default(self, url, fp, errcode, errmsg, headers): 2118 """Default error handling -- don't raise an exception.""" 2119 return addinfourl(fp, headers, "http:" + url, errcode) 2120 2121 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 2122 """Error 302 -- relocated (temporarily).""" 2123 self.tries += 1 2124 try: 2125 if self.maxtries and self.tries >= self.maxtries: 2126 if hasattr(self, "http_error_500"): 2127 meth = self.http_error_500 2128 else: 2129 meth = self.http_error_default 2130 return meth(url, fp, 500, 2131 "Internal Server Error: Redirect Recursion", 2132 headers) 2133 result = self.redirect_internal(url, fp, errcode, errmsg, 2134 headers, data) 2135 return result 2136 finally: 2137 self.tries = 0 2138 2139 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 2140 if 'location' in headers: 2141 newurl = headers['location'] 2142 elif 'uri' in headers: 2143 newurl = headers['uri'] 2144 else: 2145 return 2146 fp.close() 2147 2148 # In case the server sent a relative URL, join with original: 2149 newurl = urljoin(self.type + ":" + url, newurl) 2150 2151 urlparts = urlparse(newurl) 2152 2153 # For security reasons, we don't allow redirection to anything other 2154 # than http, https and ftp. 2155 2156 # We are using newer HTTPError with older redirect_internal method 2157 # This older method will get deprecated in 3.3 2158 2159 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 2160 raise HTTPError(newurl, errcode, 2161 errmsg + 2162 " Redirection to url '%s' is not allowed." % newurl, 2163 headers, fp) 2164 2165 return self.open(newurl) 2166 2167 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 2168 """Error 301 -- also relocated (permanently).""" 2169 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2170 2171 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): 2172 """Error 303 -- also relocated (essentially identical to 302).""" 2173 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2174 2175 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): 2176 """Error 307 -- relocated, but turn POST into error.""" 2177 if data is None: 2178 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2179 else: 2180 return self.http_error_default(url, fp, errcode, errmsg, headers) 2181 2182 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None, 2183 retry=False): 2184 """Error 401 -- authentication required. 2185 This function supports Basic authentication only.""" 2186 if 'www-authenticate' not in headers: 2187 URLopener.http_error_default(self, url, fp, 2188 errcode, errmsg, headers) 2189 stuff = headers['www-authenticate'] 2190 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 2191 if not match: 2192 URLopener.http_error_default(self, url, fp, 2193 errcode, errmsg, headers) 2194 scheme, realm = match.groups() 2195 if scheme.lower() != 'basic': 2196 URLopener.http_error_default(self, url, fp, 2197 errcode, errmsg, headers) 2198 if not retry: 2199 URLopener.http_error_default(self, url, fp, errcode, errmsg, 2200 headers) 2201 name = 'retry_' + self.type + '_basic_auth' 2202 if data is None: 2203 return getattr(self,name)(url, realm) 2204 else: 2205 return getattr(self,name)(url, realm, data) 2206 2207 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None, 2208 retry=False): 2209 """Error 407 -- proxy authentication required. 2210 This function supports Basic authentication only.""" 2211 if 'proxy-authenticate' not in headers: 2212 URLopener.http_error_default(self, url, fp, 2213 errcode, errmsg, headers) 2214 stuff = headers['proxy-authenticate'] 2215 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 2216 if not match: 2217 URLopener.http_error_default(self, url, fp, 2218 errcode, errmsg, headers) 2219 scheme, realm = match.groups() 2220 if scheme.lower() != 'basic': 2221 URLopener.http_error_default(self, url, fp, 2222 errcode, errmsg, headers) 2223 if not retry: 2224 URLopener.http_error_default(self, url, fp, errcode, errmsg, 2225 headers) 2226 name = 'retry_proxy_' + self.type + '_basic_auth' 2227 if data is None: 2228 return getattr(self,name)(url, realm) 2229 else: 2230 return getattr(self,name)(url, realm, data) 2231 2232 def retry_proxy_http_basic_auth(self, url, realm, data=None): 2233 host, selector = splithost(url) 2234 newurl = 'http://' + host + selector 2235 proxy = self.proxies['http'] 2236 urltype, proxyhost = splittype(proxy) 2237 proxyhost, proxyselector = splithost(proxyhost) 2238 i = proxyhost.find('@') + 1 2239 proxyhost = proxyhost[i:] 2240 user, passwd = self.get_user_passwd(proxyhost, realm, i) 2241 if not (user or passwd): return None 2242 proxyhost = "%s:%s@%s" % (quote(user, safe=''), 2243 quote(passwd, safe=''), proxyhost) 2244 self.proxies['http'] = 'http://' + proxyhost + proxyselector 2245 if data is None: 2246 return self.open(newurl) 2247 else: 2248 return self.open(newurl, data) 2249 2250 def retry_proxy_https_basic_auth(self, url, realm, data=None): 2251 host, selector = splithost(url) 2252 newurl = 'https://' + host + selector 2253 proxy = self.proxies['https'] 2254 urltype, proxyhost = splittype(proxy) 2255 proxyhost, proxyselector = splithost(proxyhost) 2256 i = proxyhost.find('@') + 1 2257 proxyhost = proxyhost[i:] 2258 user, passwd = self.get_user_passwd(proxyhost, realm, i) 2259 if not (user or passwd): return None 2260 proxyhost = "%s:%s@%s" % (quote(user, safe=''), 2261 quote(passwd, safe=''), proxyhost) 2262 self.proxies['https'] = 'https://' + proxyhost + proxyselector 2263 if data is None: 2264 return self.open(newurl) 2265 else: 2266 return self.open(newurl, data) 2267 2268 def retry_http_basic_auth(self, url, realm, data=None): 2269 host, selector = splithost(url) 2270 i = host.find('@') + 1 2271 host = host[i:] 2272 user, passwd = self.get_user_passwd(host, realm, i) 2273 if not (user or passwd): return None 2274 host = "%s:%s@%s" % (quote(user, safe=''), 2275 quote(passwd, safe=''), host) 2276 newurl = 'http://' + host + selector 2277 if data is None: 2278 return self.open(newurl) 2279 else: 2280 return self.open(newurl, data) 2281 2282 def retry_https_basic_auth(self, url, realm, data=None): 2283 host, selector = splithost(url) 2284 i = host.find('@') + 1 2285 host = host[i:] 2286 user, passwd = self.get_user_passwd(host, realm, i) 2287 if not (user or passwd): return None 2288 host = "%s:%s@%s" % (quote(user, safe=''), 2289 quote(passwd, safe=''), host) 2290 newurl = 'https://' + host + selector 2291 if data is None: 2292 return self.open(newurl) 2293 else: 2294 return self.open(newurl, data) 2295 2296 def get_user_passwd(self, host, realm, clear_cache=0): 2297 key = realm + '@' + host.lower() 2298 if key in self.auth_cache: 2299 if clear_cache: 2300 del self.auth_cache[key] 2301 else: 2302 return self.auth_cache[key] 2303 user, passwd = self.prompt_user_passwd(host, realm) 2304 if user or passwd: self.auth_cache[key] = (user, passwd) 2305 return user, passwd 2306 2307 def prompt_user_passwd(self, host, realm): 2308 """Override this in a GUI environment!""" 2309 import getpass 2310 try: 2311 user = input("Enter username for %s at %s: " % (realm, host)) 2312 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 2313 (user, realm, host)) 2314 return user, passwd 2315 except KeyboardInterrupt: 2316 print() 2317 return None, None 2318 2319 2320# Utility functions 2321 2322_localhost = None 2323def localhost(): 2324 """Return the IP address of the magic hostname 'localhost'.""" 2325 global _localhost 2326 if _localhost is None: 2327 _localhost = socket.gethostbyname('localhost') 2328 return _localhost 2329 2330_thishost = None 2331def thishost(): 2332 """Return the IP addresses of the current host.""" 2333 global _thishost 2334 if _thishost is None: 2335 try: 2336 _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2]) 2337 except socket.gaierror: 2338 _thishost = tuple(socket.gethostbyname_ex('localhost')[2]) 2339 return _thishost 2340 2341_ftperrors = None 2342def ftperrors(): 2343 """Return the set of errors raised by the FTP class.""" 2344 global _ftperrors 2345 if _ftperrors is None: 2346 import ftplib 2347 _ftperrors = ftplib.all_errors 2348 return _ftperrors 2349 2350_noheaders = None 2351def noheaders(): 2352 """Return an empty email Message object.""" 2353 global _noheaders 2354 if _noheaders is None: 2355 _noheaders = email.message_from_string("") 2356 return _noheaders 2357 2358 2359# Utility classes 2360 2361class ftpwrapper: 2362 """Class used by open_ftp() for cache of open FTP connections.""" 2363 2364 def __init__(self, user, passwd, host, port, dirs, timeout=None, 2365 persistent=True): 2366 self.user = user 2367 self.passwd = passwd 2368 self.host = host 2369 self.port = port 2370 self.dirs = dirs 2371 self.timeout = timeout 2372 self.refcount = 0 2373 self.keepalive = persistent 2374 try: 2375 self.init() 2376 except: 2377 self.close() 2378 raise 2379 2380 def init(self): 2381 import ftplib 2382 self.busy = 0 2383 self.ftp = ftplib.FTP() 2384 self.ftp.connect(self.host, self.port, self.timeout) 2385 self.ftp.login(self.user, self.passwd) 2386 _target = '/'.join(self.dirs) 2387 self.ftp.cwd(_target) 2388 2389 def retrfile(self, file, type): 2390 import ftplib 2391 self.endtransfer() 2392 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 2393 else: cmd = 'TYPE ' + type; isdir = 0 2394 try: 2395 self.ftp.voidcmd(cmd) 2396 except ftplib.all_errors: 2397 self.init() 2398 self.ftp.voidcmd(cmd) 2399 conn = None 2400 if file and not isdir: 2401 # Try to retrieve as a file 2402 try: 2403 cmd = 'RETR ' + file 2404 conn, retrlen = self.ftp.ntransfercmd(cmd) 2405 except ftplib.error_perm as reason: 2406 if str(reason)[:3] != '550': 2407 raise URLError('ftp error: %r' % reason).with_traceback( 2408 sys.exc_info()[2]) 2409 if not conn: 2410 # Set transfer mode to ASCII! 2411 self.ftp.voidcmd('TYPE A') 2412 # Try a directory listing. Verify that directory exists. 2413 if file: 2414 pwd = self.ftp.pwd() 2415 try: 2416 try: 2417 self.ftp.cwd(file) 2418 except ftplib.error_perm as reason: 2419 raise URLError('ftp error: %r' % reason) from reason 2420 finally: 2421 self.ftp.cwd(pwd) 2422 cmd = 'LIST ' + file 2423 else: 2424 cmd = 'LIST' 2425 conn, retrlen = self.ftp.ntransfercmd(cmd) 2426 self.busy = 1 2427 2428 ftpobj = addclosehook(conn.makefile('rb'), self.file_close) 2429 self.refcount += 1 2430 conn.close() 2431 # Pass back both a suitably decorated object and a retrieval length 2432 return (ftpobj, retrlen) 2433 2434 def endtransfer(self): 2435 self.busy = 0 2436 2437 def close(self): 2438 self.keepalive = False 2439 if self.refcount <= 0: 2440 self.real_close() 2441 2442 def file_close(self): 2443 self.endtransfer() 2444 self.refcount -= 1 2445 if self.refcount <= 0 and not self.keepalive: 2446 self.real_close() 2447 2448 def real_close(self): 2449 self.endtransfer() 2450 try: 2451 self.ftp.close() 2452 except ftperrors(): 2453 pass 2454 2455# Proxy handling 2456def getproxies_environment(): 2457 """Return a dictionary of scheme -> proxy server URL mappings. 2458 2459 Scan the environment for variables named <scheme>_proxy; 2460 this seems to be the standard convention. If you need a 2461 different way, you can pass a proxies dictionary to the 2462 [Fancy]URLopener constructor. 2463 2464 """ 2465 proxies = {} 2466 # in order to prefer lowercase variables, process environment in 2467 # two passes: first matches any, second pass matches lowercase only 2468 for name, value in os.environ.items(): 2469 name = name.lower() 2470 if value and name[-6:] == '_proxy': 2471 proxies[name[:-6]] = value 2472 # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY 2473 # (non-all-lowercase) as it may be set from the web server by a "Proxy:" 2474 # header from the client 2475 # If "proxy" is lowercase, it will still be used thanks to the next block 2476 if 'REQUEST_METHOD' in os.environ: 2477 proxies.pop('http', None) 2478 for name, value in os.environ.items(): 2479 if name[-6:] == '_proxy': 2480 name = name.lower() 2481 if value: 2482 proxies[name[:-6]] = value 2483 else: 2484 proxies.pop(name[:-6], None) 2485 return proxies 2486 2487def proxy_bypass_environment(host, proxies=None): 2488 """Test if proxies should not be used for a particular host. 2489 2490 Checks the proxy dict for the value of no_proxy, which should 2491 be a list of comma separated DNS suffixes, or '*' for all hosts. 2492 2493 """ 2494 if proxies is None: 2495 proxies = getproxies_environment() 2496 # don't bypass, if no_proxy isn't specified 2497 try: 2498 no_proxy = proxies['no'] 2499 except KeyError: 2500 return 0 2501 # '*' is special case for always bypass 2502 if no_proxy == '*': 2503 return 1 2504 # strip port off host 2505 hostonly, port = splitport(host) 2506 # check if the host ends with any of the DNS suffixes 2507 no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')] 2508 for name in no_proxy_list: 2509 if name: 2510 name = name.lstrip('.') # ignore leading dots 2511 name = re.escape(name) 2512 pattern = r'(.+\.)?%s$' % name 2513 if (re.match(pattern, hostonly, re.I) 2514 or re.match(pattern, host, re.I)): 2515 return 1 2516 # otherwise, don't bypass 2517 return 0 2518 2519 2520# This code tests an OSX specific data structure but is testable on all 2521# platforms 2522def _proxy_bypass_macosx_sysconf(host, proxy_settings): 2523 """ 2524 Return True iff this host shouldn't be accessed using a proxy 2525 2526 This function uses the MacOSX framework SystemConfiguration 2527 to fetch the proxy information. 2528 2529 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie: 2530 { 'exclude_simple': bool, 2531 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16'] 2532 } 2533 """ 2534 from fnmatch import fnmatch 2535 2536 hostonly, port = splitport(host) 2537 2538 def ip2num(ipAddr): 2539 parts = ipAddr.split('.') 2540 parts = list(map(int, parts)) 2541 if len(parts) != 4: 2542 parts = (parts + [0, 0, 0, 0])[:4] 2543 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] 2544 2545 # Check for simple host names: 2546 if '.' not in host: 2547 if proxy_settings['exclude_simple']: 2548 return True 2549 2550 hostIP = None 2551 2552 for value in proxy_settings.get('exceptions', ()): 2553 # Items in the list are strings like these: *.local, 169.254/16 2554 if not value: continue 2555 2556 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) 2557 if m is not None: 2558 if hostIP is None: 2559 try: 2560 hostIP = socket.gethostbyname(hostonly) 2561 hostIP = ip2num(hostIP) 2562 except OSError: 2563 continue 2564 2565 base = ip2num(m.group(1)) 2566 mask = m.group(2) 2567 if mask is None: 2568 mask = 8 * (m.group(1).count('.') + 1) 2569 else: 2570 mask = int(mask[1:]) 2571 mask = 32 - mask 2572 2573 if (hostIP >> mask) == (base >> mask): 2574 return True 2575 2576 elif fnmatch(host, value): 2577 return True 2578 2579 return False 2580 2581 2582if sys.platform == 'darwin': 2583 from _scproxy import _get_proxy_settings, _get_proxies 2584 2585 def proxy_bypass_macosx_sysconf(host): 2586 proxy_settings = _get_proxy_settings() 2587 return _proxy_bypass_macosx_sysconf(host, proxy_settings) 2588 2589 def getproxies_macosx_sysconf(): 2590 """Return a dictionary of scheme -> proxy server URL mappings. 2591 2592 This function uses the MacOSX framework SystemConfiguration 2593 to fetch the proxy information. 2594 """ 2595 return _get_proxies() 2596 2597 2598 2599 def proxy_bypass(host): 2600 """Return True, if host should be bypassed. 2601 2602 Checks proxy settings gathered from the environment, if specified, 2603 or from the MacOSX framework SystemConfiguration. 2604 2605 """ 2606 proxies = getproxies_environment() 2607 if proxies: 2608 return proxy_bypass_environment(host, proxies) 2609 else: 2610 return proxy_bypass_macosx_sysconf(host) 2611 2612 def getproxies(): 2613 return getproxies_environment() or getproxies_macosx_sysconf() 2614 2615 2616elif os.name == 'nt': 2617 def getproxies_registry(): 2618 """Return a dictionary of scheme -> proxy server URL mappings. 2619 2620 Win32 uses the registry to store proxies. 2621 2622 """ 2623 proxies = {} 2624 try: 2625 import winreg 2626 except ImportError: 2627 # Std module, so should be around - but you never know! 2628 return proxies 2629 try: 2630 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2631 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2632 proxyEnable = winreg.QueryValueEx(internetSettings, 2633 'ProxyEnable')[0] 2634 if proxyEnable: 2635 # Returned as Unicode but problems if not converted to ASCII 2636 proxyServer = str(winreg.QueryValueEx(internetSettings, 2637 'ProxyServer')[0]) 2638 if '=' in proxyServer: 2639 # Per-protocol settings 2640 for p in proxyServer.split(';'): 2641 protocol, address = p.split('=', 1) 2642 # See if address has a type:// prefix 2643 if not re.match('^([^/:]+)://', address): 2644 address = '%s://%s' % (protocol, address) 2645 proxies[protocol] = address 2646 else: 2647 # Use one setting for all protocols 2648 if proxyServer[:5] == 'http:': 2649 proxies['http'] = proxyServer 2650 else: 2651 proxies['http'] = 'http://%s' % proxyServer 2652 proxies['https'] = 'https://%s' % proxyServer 2653 proxies['ftp'] = 'ftp://%s' % proxyServer 2654 internetSettings.Close() 2655 except (OSError, ValueError, TypeError): 2656 # Either registry key not found etc, or the value in an 2657 # unexpected format. 2658 # proxies already set up to be empty so nothing to do 2659 pass 2660 return proxies 2661 2662 def getproxies(): 2663 """Return a dictionary of scheme -> proxy server URL mappings. 2664 2665 Returns settings gathered from the environment, if specified, 2666 or the registry. 2667 2668 """ 2669 return getproxies_environment() or getproxies_registry() 2670 2671 def proxy_bypass_registry(host): 2672 try: 2673 import winreg 2674 except ImportError: 2675 # Std modules, so should be around - but you never know! 2676 return 0 2677 try: 2678 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2679 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2680 proxyEnable = winreg.QueryValueEx(internetSettings, 2681 'ProxyEnable')[0] 2682 proxyOverride = str(winreg.QueryValueEx(internetSettings, 2683 'ProxyOverride')[0]) 2684 # ^^^^ Returned as Unicode but problems if not converted to ASCII 2685 except OSError: 2686 return 0 2687 if not proxyEnable or not proxyOverride: 2688 return 0 2689 # try to make a host list from name and IP address. 2690 rawHost, port = splitport(host) 2691 host = [rawHost] 2692 try: 2693 addr = socket.gethostbyname(rawHost) 2694 if addr != rawHost: 2695 host.append(addr) 2696 except OSError: 2697 pass 2698 try: 2699 fqdn = socket.getfqdn(rawHost) 2700 if fqdn != rawHost: 2701 host.append(fqdn) 2702 except OSError: 2703 pass 2704 # make a check value list from the registry entry: replace the 2705 # '<local>' string by the localhost entry and the corresponding 2706 # canonical entry. 2707 proxyOverride = proxyOverride.split(';') 2708 # now check if we match one of the registry values. 2709 for test in proxyOverride: 2710 if test == '<local>': 2711 if '.' not in rawHost: 2712 return 1 2713 test = test.replace(".", r"\.") # mask dots 2714 test = test.replace("*", r".*") # change glob sequence 2715 test = test.replace("?", r".") # change glob char 2716 for val in host: 2717 if re.match(test, val, re.I): 2718 return 1 2719 return 0 2720 2721 def proxy_bypass(host): 2722 """Return True, if host should be bypassed. 2723 2724 Checks proxy settings gathered from the environment, if specified, 2725 or the registry. 2726 2727 """ 2728 proxies = getproxies_environment() 2729 if proxies: 2730 return proxy_bypass_environment(host, proxies) 2731 else: 2732 return proxy_bypass_registry(host) 2733 2734else: 2735 # By default use environment variables 2736 getproxies = getproxies_environment 2737 proxy_bypass = proxy_bypass_environment 2738