1"""An extensible library for opening URLs using a variety of protocols 2 3The simplest way to use this module is to call the urlopen function, 4which accepts a string containing a URL or a Request object (described 5below). It opens the URL and returns the results as file-like 6object; the returned object has some extra methods described below. 7 8The OpenerDirector manages a collection of Handler objects that do 9all the actual work. Each Handler implements a particular protocol or 10option. The OpenerDirector is a composite object that invokes the 11Handlers needed to open the requested URL. For example, the 12HTTPHandler performs HTTP GET and POST requests and deals with 13non-error returns. The HTTPRedirectHandler automatically deals with 14HTTP 301, 302, 303, 307, and 308 redirect errors, and the 15HTTPDigestAuthHandler deals with digest authentication. 16 17urlopen(url, data=None) -- Basic usage is the same as original 18urllib. pass the url and optionally data to post to an HTTP URL, and 19get a file-like object back. One difference is that you can also pass 20a Request instance instead of URL. Raises a URLError (subclass of 21OSError); for HTTP errors, raises an HTTPError, which can also be 22treated as a valid response. 23 24build_opener -- Function that creates a new OpenerDirector instance. 25Will install the default handlers. Accepts one or more Handlers as 26arguments, either instances or Handler classes that it will 27instantiate. If one of the argument is a subclass of the default 28handler, the argument will be installed instead of the default. 29 30install_opener -- Installs a new opener as the default opener. 31 32objects of interest: 33 34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 35the Handler classes, while dealing with requests and responses. 36 37Request -- An object that encapsulates the state of a request. The 38state can be as simple as the URL. It can also include extra HTTP 39headers, e.g. a User-Agent. 40 41BaseHandler -- 42 43internals: 44BaseHandler and parent 45_call_chain conventions 46 47Example usage: 48 49import urllib.request 50 51# set up authentication info 52authinfo = urllib.request.HTTPBasicAuthHandler() 53authinfo.add_password(realm='PDQ Application', 54 uri='https://mahler:8092/site-updates.py', 55 user='klem', 56 passwd='geheim$parole') 57 58proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"}) 59 60# build a new opener that adds authentication and caching FTP handlers 61opener = urllib.request.build_opener(proxy_support, authinfo, 62 urllib.request.CacheFTPHandler) 63 64# install it 65urllib.request.install_opener(opener) 66 67f = urllib.request.urlopen('https://www.python.org/') 68""" 69 70# XXX issues: 71# If an authentication error handler that tries to perform 72# authentication for some reason but fails, how should the error be 73# signalled? The client needs to know the HTTP error code. But if 74# the handler knows that the problem was, e.g., that it didn't know 75# that hash algo that requested in the challenge, it would be good to 76# pass that information along to the client, too. 77# ftp errors aren't handled cleanly 78# check digest against correct (i.e. non-apache) implementation 79 80# Possible extensions: 81# complex proxies XXX not sure what exactly was meant by this 82# abstract factory for opener 83 84import base64 85import bisect 86import email 87import hashlib 88import http.client 89import io 90import os 91import re 92import socket 93import string 94import sys 95import time 96import tempfile 97import contextlib 98import warnings 99 100 101from urllib.error import URLError, HTTPError, ContentTooShortError 102from urllib.parse import ( 103 urlparse, urlsplit, urljoin, unwrap, quote, unquote, 104 _splittype, _splithost, _splitport, _splituser, _splitpasswd, 105 _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes, 106 unquote_to_bytes, urlunparse) 107from urllib.response import addinfourl, addclosehook 108 109# check for SSL 110try: 111 import ssl 112except ImportError: 113 _have_ssl = False 114else: 115 _have_ssl = True 116 117__all__ = [ 118 # Classes 119 'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler', 120 'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler', 121 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 122 'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler', 123 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler', 124 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler', 125 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler', 126 'UnknownHandler', 'HTTPErrorProcessor', 127 # Functions 128 'urlopen', 'install_opener', 'build_opener', 129 'pathname2url', 'url2pathname', 'getproxies', 130 # Legacy interface 131 'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener', 132] 133 134# used in User-Agent header sent 135__version__ = '%d.%d' % sys.version_info[:2] 136 137_opener = None 138def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, 139 *, context=None): 140 '''Open the URL url, which can be either a string or a Request object. 141 142 *data* must be an object specifying additional data to be sent to 143 the server, or None if no such data is needed. See Request for 144 details. 145 146 urllib.request module uses HTTP/1.1 and includes a "Connection:close" 147 header in its HTTP requests. 148 149 The optional *timeout* parameter specifies a timeout in seconds for 150 blocking operations like the connection attempt (if not specified, the 151 global default timeout setting will be used). This only works for HTTP, 152 HTTPS and FTP connections. 153 154 If *context* is specified, it must be a ssl.SSLContext instance describing 155 the various SSL options. See HTTPSConnection for more details. 156 157 158 This function always returns an object which can work as a 159 context manager and has the properties url, headers, and status. 160 See urllib.response.addinfourl for more detail on these properties. 161 162 For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse 163 object slightly modified. In addition to the three new methods above, the 164 msg attribute contains the same information as the reason attribute --- 165 the reason phrase returned by the server --- instead of the response 166 headers as it is specified in the documentation for HTTPResponse. 167 168 For FTP, file, and data URLs and requests explicitly handled by legacy 169 URLopener and FancyURLopener classes, this function returns a 170 urllib.response.addinfourl object. 171 172 Note that None may be returned if no handler handles the request (though 173 the default installed global OpenerDirector uses UnknownHandler to ensure 174 this never happens). 175 176 In addition, if proxy settings are detected (for example, when a *_proxy 177 environment variable like http_proxy is set), ProxyHandler is default 178 installed and makes sure the requests are handled through the proxy. 179 180 ''' 181 global _opener 182 if context: 183 https_handler = HTTPSHandler(context=context) 184 opener = build_opener(https_handler) 185 elif _opener is None: 186 _opener = opener = build_opener() 187 else: 188 opener = _opener 189 return opener.open(url, data, timeout) 190 191def install_opener(opener): 192 global _opener 193 _opener = opener 194 195_url_tempfiles = [] 196def urlretrieve(url, filename=None, reporthook=None, data=None): 197 """ 198 Retrieve a URL into a temporary location on disk. 199 200 Requires a URL argument. If a filename is passed, it is used as 201 the temporary file location. The reporthook argument should be 202 a callable that accepts a block number, a read size, and the 203 total file size of the URL target. The data argument should be 204 valid URL encoded data. 205 206 If a filename is passed and the URL points to a local resource, 207 the result is a copy from local file to new file. 208 209 Returns a tuple containing the path to the newly created 210 data file as well as the resulting HTTPMessage object. 211 """ 212 url_type, path = _splittype(url) 213 214 with contextlib.closing(urlopen(url, data)) as fp: 215 headers = fp.info() 216 217 # Just return the local path and the "headers" for file:// 218 # URLs. No sense in performing a copy unless requested. 219 if url_type == "file" and not filename: 220 return os.path.normpath(path), headers 221 222 # Handle temporary file setup. 223 if filename: 224 tfp = open(filename, 'wb') 225 else: 226 tfp = tempfile.NamedTemporaryFile(delete=False) 227 filename = tfp.name 228 _url_tempfiles.append(filename) 229 230 with tfp: 231 result = filename, headers 232 bs = 1024*8 233 size = -1 234 read = 0 235 blocknum = 0 236 if "content-length" in headers: 237 size = int(headers["Content-Length"]) 238 239 if reporthook: 240 reporthook(blocknum, bs, size) 241 242 while block := fp.read(bs): 243 read += len(block) 244 tfp.write(block) 245 blocknum += 1 246 if reporthook: 247 reporthook(blocknum, bs, size) 248 249 if size >= 0 and read < size: 250 raise ContentTooShortError( 251 "retrieval incomplete: got only %i out of %i bytes" 252 % (read, size), result) 253 254 return result 255 256def urlcleanup(): 257 """Clean up temporary files from urlretrieve calls.""" 258 for temp_file in _url_tempfiles: 259 try: 260 os.unlink(temp_file) 261 except OSError: 262 pass 263 264 del _url_tempfiles[:] 265 global _opener 266 if _opener: 267 _opener = None 268 269# copied from cookielib.py 270_cut_port_re = re.compile(r":\d+$", re.ASCII) 271def request_host(request): 272 """Return request-host, as defined by RFC 2965. 273 274 Variation from RFC: returned value is lowercased, for convenient 275 comparison. 276 277 """ 278 url = request.full_url 279 host = urlparse(url)[1] 280 if host == "": 281 host = request.get_header("Host", "") 282 283 # remove port, if present 284 host = _cut_port_re.sub("", host, 1) 285 return host.lower() 286 287class Request: 288 289 def __init__(self, url, data=None, headers={}, 290 origin_req_host=None, unverifiable=False, 291 method=None): 292 self.full_url = url 293 self.headers = {} 294 self.unredirected_hdrs = {} 295 self._data = None 296 self.data = data 297 self._tunnel_host = None 298 for key, value in headers.items(): 299 self.add_header(key, value) 300 if origin_req_host is None: 301 origin_req_host = request_host(self) 302 self.origin_req_host = origin_req_host 303 self.unverifiable = unverifiable 304 if method: 305 self.method = method 306 307 @property 308 def full_url(self): 309 if self.fragment: 310 return '{}#{}'.format(self._full_url, self.fragment) 311 return self._full_url 312 313 @full_url.setter 314 def full_url(self, url): 315 # unwrap('<URL:type://host/path>') --> 'type://host/path' 316 self._full_url = unwrap(url) 317 self._full_url, self.fragment = _splittag(self._full_url) 318 self._parse() 319 320 @full_url.deleter 321 def full_url(self): 322 self._full_url = None 323 self.fragment = None 324 self.selector = '' 325 326 @property 327 def data(self): 328 return self._data 329 330 @data.setter 331 def data(self, data): 332 if data != self._data: 333 self._data = data 334 # issue 16464 335 # if we change data we need to remove content-length header 336 # (cause it's most probably calculated for previous value) 337 if self.has_header("Content-length"): 338 self.remove_header("Content-length") 339 340 @data.deleter 341 def data(self): 342 self.data = None 343 344 def _parse(self): 345 self.type, rest = _splittype(self._full_url) 346 if self.type is None: 347 raise ValueError("unknown url type: %r" % self.full_url) 348 self.host, self.selector = _splithost(rest) 349 if self.host: 350 self.host = unquote(self.host) 351 352 def get_method(self): 353 """Return a string indicating the HTTP request method.""" 354 default_method = "POST" if self.data is not None else "GET" 355 return getattr(self, 'method', default_method) 356 357 def get_full_url(self): 358 return self.full_url 359 360 def set_proxy(self, host, type): 361 if self.type == 'https' and not self._tunnel_host: 362 self._tunnel_host = self.host 363 else: 364 self.type= type 365 self.selector = self.full_url 366 self.host = host 367 368 def has_proxy(self): 369 return self.selector == self.full_url 370 371 def add_header(self, key, val): 372 # useful for something like authentication 373 self.headers[key.capitalize()] = val 374 375 def add_unredirected_header(self, key, val): 376 # will not be added to a redirected request 377 self.unredirected_hdrs[key.capitalize()] = val 378 379 def has_header(self, header_name): 380 return (header_name in self.headers or 381 header_name in self.unredirected_hdrs) 382 383 def get_header(self, header_name, default=None): 384 return self.headers.get( 385 header_name, 386 self.unredirected_hdrs.get(header_name, default)) 387 388 def remove_header(self, header_name): 389 self.headers.pop(header_name, None) 390 self.unredirected_hdrs.pop(header_name, None) 391 392 def header_items(self): 393 hdrs = {**self.unredirected_hdrs, **self.headers} 394 return list(hdrs.items()) 395 396class OpenerDirector: 397 def __init__(self): 398 client_version = "Python-urllib/%s" % __version__ 399 self.addheaders = [('User-agent', client_version)] 400 # self.handlers is retained only for backward compatibility 401 self.handlers = [] 402 # manage the individual handlers 403 self.handle_open = {} 404 self.handle_error = {} 405 self.process_response = {} 406 self.process_request = {} 407 408 def add_handler(self, handler): 409 if not hasattr(handler, "add_parent"): 410 raise TypeError("expected BaseHandler instance, got %r" % 411 type(handler)) 412 413 added = False 414 for meth in dir(handler): 415 if meth in ["redirect_request", "do_open", "proxy_open"]: 416 # oops, coincidental match 417 continue 418 419 i = meth.find("_") 420 protocol = meth[:i] 421 condition = meth[i+1:] 422 423 if condition.startswith("error"): 424 j = condition.find("_") + i + 1 425 kind = meth[j+1:] 426 try: 427 kind = int(kind) 428 except ValueError: 429 pass 430 lookup = self.handle_error.get(protocol, {}) 431 self.handle_error[protocol] = lookup 432 elif condition == "open": 433 kind = protocol 434 lookup = self.handle_open 435 elif condition == "response": 436 kind = protocol 437 lookup = self.process_response 438 elif condition == "request": 439 kind = protocol 440 lookup = self.process_request 441 else: 442 continue 443 444 handlers = lookup.setdefault(kind, []) 445 if handlers: 446 bisect.insort(handlers, handler) 447 else: 448 handlers.append(handler) 449 added = True 450 451 if added: 452 bisect.insort(self.handlers, handler) 453 handler.add_parent(self) 454 455 def close(self): 456 # Only exists for backwards compatibility. 457 pass 458 459 def _call_chain(self, chain, kind, meth_name, *args): 460 # Handlers raise an exception if no one else should try to handle 461 # the request, or return None if they can't but another handler 462 # could. Otherwise, they return the response. 463 handlers = chain.get(kind, ()) 464 for handler in handlers: 465 func = getattr(handler, meth_name) 466 result = func(*args) 467 if result is not None: 468 return result 469 470 def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): 471 # accept a URL or a Request object 472 if isinstance(fullurl, str): 473 req = Request(fullurl, data) 474 else: 475 req = fullurl 476 if data is not None: 477 req.data = data 478 479 req.timeout = timeout 480 protocol = req.type 481 482 # pre-process request 483 meth_name = protocol+"_request" 484 for processor in self.process_request.get(protocol, []): 485 meth = getattr(processor, meth_name) 486 req = meth(req) 487 488 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method()) 489 response = self._open(req, data) 490 491 # post-process response 492 meth_name = protocol+"_response" 493 for processor in self.process_response.get(protocol, []): 494 meth = getattr(processor, meth_name) 495 response = meth(req, response) 496 497 return response 498 499 def _open(self, req, data=None): 500 result = self._call_chain(self.handle_open, 'default', 501 'default_open', req) 502 if result: 503 return result 504 505 protocol = req.type 506 result = self._call_chain(self.handle_open, protocol, protocol + 507 '_open', req) 508 if result: 509 return result 510 511 return self._call_chain(self.handle_open, 'unknown', 512 'unknown_open', req) 513 514 def error(self, proto, *args): 515 if proto in ('http', 'https'): 516 # XXX http[s] protocols are special-cased 517 dict = self.handle_error['http'] # https is not different than http 518 proto = args[2] # YUCK! 519 meth_name = 'http_error_%s' % proto 520 http_err = 1 521 orig_args = args 522 else: 523 dict = self.handle_error 524 meth_name = proto + '_error' 525 http_err = 0 526 args = (dict, proto, meth_name) + args 527 result = self._call_chain(*args) 528 if result: 529 return result 530 531 if http_err: 532 args = (dict, 'default', 'http_error_default') + orig_args 533 return self._call_chain(*args) 534 535# XXX probably also want an abstract factory that knows when it makes 536# sense to skip a superclass in favor of a subclass and when it might 537# make sense to include both 538 539def build_opener(*handlers): 540 """Create an opener object from a list of handlers. 541 542 The opener will use several default handlers, including support 543 for HTTP, FTP and when applicable HTTPS. 544 545 If any of the handlers passed as arguments are subclasses of the 546 default handlers, the default handlers will not be used. 547 """ 548 opener = OpenerDirector() 549 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, 550 HTTPDefaultErrorHandler, HTTPRedirectHandler, 551 FTPHandler, FileHandler, HTTPErrorProcessor, 552 DataHandler] 553 if hasattr(http.client, "HTTPSConnection"): 554 default_classes.append(HTTPSHandler) 555 skip = set() 556 for klass in default_classes: 557 for check in handlers: 558 if isinstance(check, type): 559 if issubclass(check, klass): 560 skip.add(klass) 561 elif isinstance(check, klass): 562 skip.add(klass) 563 for klass in skip: 564 default_classes.remove(klass) 565 566 for klass in default_classes: 567 opener.add_handler(klass()) 568 569 for h in handlers: 570 if isinstance(h, type): 571 h = h() 572 opener.add_handler(h) 573 return opener 574 575class BaseHandler: 576 handler_order = 500 577 578 def add_parent(self, parent): 579 self.parent = parent 580 581 def close(self): 582 # Only exists for backwards compatibility 583 pass 584 585 def __lt__(self, other): 586 if not hasattr(other, "handler_order"): 587 # Try to preserve the old behavior of having custom classes 588 # inserted after default ones (works only for custom user 589 # classes which are not aware of handler_order). 590 return True 591 return self.handler_order < other.handler_order 592 593 594class HTTPErrorProcessor(BaseHandler): 595 """Process HTTP error responses.""" 596 handler_order = 1000 # after all other processing 597 598 def http_response(self, request, response): 599 code, msg, hdrs = response.code, response.msg, response.info() 600 601 # According to RFC 2616, "2xx" code indicates that the client's 602 # request was successfully received, understood, and accepted. 603 if not (200 <= code < 300): 604 response = self.parent.error( 605 'http', request, response, code, msg, hdrs) 606 607 return response 608 609 https_response = http_response 610 611class HTTPDefaultErrorHandler(BaseHandler): 612 def http_error_default(self, req, fp, code, msg, hdrs): 613 raise HTTPError(req.full_url, code, msg, hdrs, fp) 614 615class HTTPRedirectHandler(BaseHandler): 616 # maximum number of redirections to any single URL 617 # this is needed because of the state that cookies introduce 618 max_repeats = 4 619 # maximum total number of redirections (regardless of URL) before 620 # assuming we're in a loop 621 max_redirections = 10 622 623 def redirect_request(self, req, fp, code, msg, headers, newurl): 624 """Return a Request or None in response to a redirect. 625 626 This is called by the http_error_30x methods when a 627 redirection response is received. If a redirection should 628 take place, return a new Request to allow http_error_30x to 629 perform the redirect. Otherwise, raise HTTPError if no-one 630 else should try to handle this url. Return None if you can't 631 but another Handler might. 632 """ 633 m = req.get_method() 634 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") 635 or code in (301, 302, 303) and m == "POST")): 636 raise HTTPError(req.full_url, code, msg, headers, fp) 637 638 # Strictly (according to RFC 2616), 301 or 302 in response to 639 # a POST MUST NOT cause a redirection without confirmation 640 # from the user (of urllib.request, in this case). In practice, 641 # essentially all clients do redirect in this case, so we do 642 # the same. 643 644 # Be conciliant with URIs containing a space. This is mainly 645 # redundant with the more complete encoding done in http_error_302(), 646 # but it is kept for compatibility with other callers. 647 newurl = newurl.replace(' ', '%20') 648 649 CONTENT_HEADERS = ("content-length", "content-type") 650 newheaders = {k: v for k, v in req.headers.items() 651 if k.lower() not in CONTENT_HEADERS} 652 return Request(newurl, 653 method="HEAD" if m == "HEAD" else "GET", 654 headers=newheaders, 655 origin_req_host=req.origin_req_host, 656 unverifiable=True) 657 658 # Implementation note: To avoid the server sending us into an 659 # infinite loop, the request object needs to track what URLs we 660 # have already seen. Do this by adding a handler-specific 661 # attribute to the Request object. 662 def http_error_302(self, req, fp, code, msg, headers): 663 # Some servers (incorrectly) return multiple Location headers 664 # (so probably same goes for URI). Use first header. 665 if "location" in headers: 666 newurl = headers["location"] 667 elif "uri" in headers: 668 newurl = headers["uri"] 669 else: 670 return 671 672 # fix a possible malformed URL 673 urlparts = urlparse(newurl) 674 675 # For security reasons we don't allow redirection to anything other 676 # than http, https or ftp. 677 678 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 679 raise HTTPError( 680 newurl, code, 681 "%s - Redirection to url '%s' is not allowed" % (msg, newurl), 682 headers, fp) 683 684 if not urlparts.path and urlparts.netloc: 685 urlparts = list(urlparts) 686 urlparts[2] = "/" 687 newurl = urlunparse(urlparts) 688 689 # http.client.parse_headers() decodes as ISO-8859-1. Recover the 690 # original bytes and percent-encode non-ASCII bytes, and any special 691 # characters such as the space. 692 newurl = quote( 693 newurl, encoding="iso-8859-1", safe=string.punctuation) 694 newurl = urljoin(req.full_url, newurl) 695 696 # XXX Probably want to forget about the state of the current 697 # request, although that might interact poorly with other 698 # handlers that also use handler-specific request attributes 699 new = self.redirect_request(req, fp, code, msg, headers, newurl) 700 if new is None: 701 return 702 703 # loop detection 704 # .redirect_dict has a key url if url was previously visited. 705 if hasattr(req, 'redirect_dict'): 706 visited = new.redirect_dict = req.redirect_dict 707 if (visited.get(newurl, 0) >= self.max_repeats or 708 len(visited) >= self.max_redirections): 709 raise HTTPError(req.full_url, code, 710 self.inf_msg + msg, headers, fp) 711 else: 712 visited = new.redirect_dict = req.redirect_dict = {} 713 visited[newurl] = visited.get(newurl, 0) + 1 714 715 # Don't close the fp until we are sure that we won't use it 716 # with HTTPError. 717 fp.read() 718 fp.close() 719 720 return self.parent.open(new, timeout=req.timeout) 721 722 http_error_301 = http_error_303 = http_error_307 = http_error_308 = http_error_302 723 724 inf_msg = "The HTTP server returned a redirect error that would " \ 725 "lead to an infinite loop.\n" \ 726 "The last 30x error message was:\n" 727 728 729def _parse_proxy(proxy): 730 """Return (scheme, user, password, host/port) given a URL or an authority. 731 732 If a URL is supplied, it must have an authority (host:port) component. 733 According to RFC 3986, having an authority component means the URL must 734 have two slashes after the scheme. 735 """ 736 scheme, r_scheme = _splittype(proxy) 737 if not r_scheme.startswith("/"): 738 # authority 739 scheme = None 740 authority = proxy 741 else: 742 # URL 743 if not r_scheme.startswith("//"): 744 raise ValueError("proxy URL with no authority: %r" % proxy) 745 # We have an authority, so for RFC 3986-compliant URLs (by ss 3. 746 # and 3.3.), path is empty or starts with '/' 747 if '@' in r_scheme: 748 host_separator = r_scheme.find('@') 749 end = r_scheme.find("/", host_separator) 750 else: 751 end = r_scheme.find("/", 2) 752 if end == -1: 753 end = None 754 authority = r_scheme[2:end] 755 userinfo, hostport = _splituser(authority) 756 if userinfo is not None: 757 user, password = _splitpasswd(userinfo) 758 else: 759 user = password = None 760 return scheme, user, password, hostport 761 762class ProxyHandler(BaseHandler): 763 # Proxies must be in front 764 handler_order = 100 765 766 def __init__(self, proxies=None): 767 if proxies is None: 768 proxies = getproxies() 769 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 770 self.proxies = proxies 771 for type, url in proxies.items(): 772 type = type.lower() 773 setattr(self, '%s_open' % type, 774 lambda r, proxy=url, type=type, meth=self.proxy_open: 775 meth(r, proxy, type)) 776 777 def proxy_open(self, req, proxy, type): 778 orig_type = req.type 779 proxy_type, user, password, hostport = _parse_proxy(proxy) 780 if proxy_type is None: 781 proxy_type = orig_type 782 783 if req.host and proxy_bypass(req.host): 784 return None 785 786 if user and password: 787 user_pass = '%s:%s' % (unquote(user), 788 unquote(password)) 789 creds = base64.b64encode(user_pass.encode()).decode("ascii") 790 req.add_header('Proxy-authorization', 'Basic ' + creds) 791 hostport = unquote(hostport) 792 req.set_proxy(hostport, proxy_type) 793 if orig_type == proxy_type or orig_type == 'https': 794 # let other handlers take care of it 795 return None 796 else: 797 # need to start over, because the other handlers don't 798 # grok the proxy's URL type 799 # e.g. if we have a constructor arg proxies like so: 800 # {'http': 'ftp://proxy.example.com'}, we may end up turning 801 # a request for http://acme.example.com/a into one for 802 # ftp://proxy.example.com/a 803 return self.parent.open(req, timeout=req.timeout) 804 805class HTTPPasswordMgr: 806 807 def __init__(self): 808 self.passwd = {} 809 810 def add_password(self, realm, uri, user, passwd): 811 # uri could be a single URI or a sequence 812 if isinstance(uri, str): 813 uri = [uri] 814 if realm not in self.passwd: 815 self.passwd[realm] = {} 816 for default_port in True, False: 817 reduced_uri = tuple( 818 self.reduce_uri(u, default_port) for u in uri) 819 self.passwd[realm][reduced_uri] = (user, passwd) 820 821 def find_user_password(self, realm, authuri): 822 domains = self.passwd.get(realm, {}) 823 for default_port in True, False: 824 reduced_authuri = self.reduce_uri(authuri, default_port) 825 for uris, authinfo in domains.items(): 826 for uri in uris: 827 if self.is_suburi(uri, reduced_authuri): 828 return authinfo 829 return None, None 830 831 def reduce_uri(self, uri, default_port=True): 832 """Accept authority or URI and extract only the authority and path.""" 833 # note HTTP URLs do not have a userinfo component 834 parts = urlsplit(uri) 835 if parts[1]: 836 # URI 837 scheme = parts[0] 838 authority = parts[1] 839 path = parts[2] or '/' 840 else: 841 # host or host:port 842 scheme = None 843 authority = uri 844 path = '/' 845 host, port = _splitport(authority) 846 if default_port and port is None and scheme is not None: 847 dport = {"http": 80, 848 "https": 443, 849 }.get(scheme) 850 if dport is not None: 851 authority = "%s:%d" % (host, dport) 852 return authority, path 853 854 def is_suburi(self, base, test): 855 """Check if test is below base in a URI tree 856 857 Both args must be URIs in reduced form. 858 """ 859 if base == test: 860 return True 861 if base[0] != test[0]: 862 return False 863 prefix = base[1] 864 if prefix[-1:] != '/': 865 prefix += '/' 866 return test[1].startswith(prefix) 867 868 869class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): 870 871 def find_user_password(self, realm, authuri): 872 user, password = HTTPPasswordMgr.find_user_password(self, realm, 873 authuri) 874 if user is not None: 875 return user, password 876 return HTTPPasswordMgr.find_user_password(self, None, authuri) 877 878 879class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm): 880 881 def __init__(self, *args, **kwargs): 882 self.authenticated = {} 883 super().__init__(*args, **kwargs) 884 885 def add_password(self, realm, uri, user, passwd, is_authenticated=False): 886 self.update_authenticated(uri, is_authenticated) 887 # Add a default for prior auth requests 888 if realm is not None: 889 super().add_password(None, uri, user, passwd) 890 super().add_password(realm, uri, user, passwd) 891 892 def update_authenticated(self, uri, is_authenticated=False): 893 # uri could be a single URI or a sequence 894 if isinstance(uri, str): 895 uri = [uri] 896 897 for default_port in True, False: 898 for u in uri: 899 reduced_uri = self.reduce_uri(u, default_port) 900 self.authenticated[reduced_uri] = is_authenticated 901 902 def is_authenticated(self, authuri): 903 for default_port in True, False: 904 reduced_authuri = self.reduce_uri(authuri, default_port) 905 for uri in self.authenticated: 906 if self.is_suburi(uri, reduced_authuri): 907 return self.authenticated[uri] 908 909 910class AbstractBasicAuthHandler: 911 912 # XXX this allows for multiple auth-schemes, but will stupidly pick 913 # the last one with a realm specified. 914 915 # allow for double- and single-quoted realm values 916 # (single quotes are a violation of the RFC, but appear in the wild) 917 rx = re.compile('(?:^|,)' # start of the string or ',' 918 '[ \t]*' # optional whitespaces 919 '([^ \t,]+)' # scheme like "Basic" 920 '[ \t]+' # mandatory whitespaces 921 # realm=xxx 922 # realm='xxx' 923 # realm="xxx" 924 'realm=(["\']?)([^"\']*)\\2', 925 re.I) 926 927 # XXX could pre-emptively send auth info already accepted (RFC 2617, 928 # end of section 2, and section 1.2 immediately after "credentials" 929 # production). 930 931 def __init__(self, password_mgr=None): 932 if password_mgr is None: 933 password_mgr = HTTPPasswordMgr() 934 self.passwd = password_mgr 935 self.add_password = self.passwd.add_password 936 937 def _parse_realm(self, header): 938 # parse WWW-Authenticate header: accept multiple challenges per header 939 found_challenge = False 940 for mo in AbstractBasicAuthHandler.rx.finditer(header): 941 scheme, quote, realm = mo.groups() 942 if quote not in ['"', "'"]: 943 warnings.warn("Basic Auth Realm was unquoted", 944 UserWarning, 3) 945 946 yield (scheme, realm) 947 948 found_challenge = True 949 950 if not found_challenge: 951 if header: 952 scheme = header.split()[0] 953 else: 954 scheme = '' 955 yield (scheme, None) 956 957 def http_error_auth_reqed(self, authreq, host, req, headers): 958 # host may be an authority (without userinfo) or a URL with an 959 # authority 960 headers = headers.get_all(authreq) 961 if not headers: 962 # no header found 963 return 964 965 unsupported = None 966 for header in headers: 967 for scheme, realm in self._parse_realm(header): 968 if scheme.lower() != 'basic': 969 unsupported = scheme 970 continue 971 972 if realm is not None: 973 # Use the first matching Basic challenge. 974 # Ignore following challenges even if they use the Basic 975 # scheme. 976 return self.retry_http_basic_auth(host, req, realm) 977 978 if unsupported is not None: 979 raise ValueError("AbstractBasicAuthHandler does not " 980 "support the following scheme: %r" 981 % (scheme,)) 982 983 def retry_http_basic_auth(self, host, req, realm): 984 user, pw = self.passwd.find_user_password(realm, host) 985 if pw is not None: 986 raw = "%s:%s" % (user, pw) 987 auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii") 988 if req.get_header(self.auth_header, None) == auth: 989 return None 990 req.add_unredirected_header(self.auth_header, auth) 991 return self.parent.open(req, timeout=req.timeout) 992 else: 993 return None 994 995 def http_request(self, req): 996 if (not hasattr(self.passwd, 'is_authenticated') or 997 not self.passwd.is_authenticated(req.full_url)): 998 return req 999 1000 if not req.has_header('Authorization'): 1001 user, passwd = self.passwd.find_user_password(None, req.full_url) 1002 credentials = '{0}:{1}'.format(user, passwd).encode() 1003 auth_str = base64.standard_b64encode(credentials).decode() 1004 req.add_unredirected_header('Authorization', 1005 'Basic {}'.format(auth_str.strip())) 1006 return req 1007 1008 def http_response(self, req, response): 1009 if hasattr(self.passwd, 'is_authenticated'): 1010 if 200 <= response.code < 300: 1011 self.passwd.update_authenticated(req.full_url, True) 1012 else: 1013 self.passwd.update_authenticated(req.full_url, False) 1014 return response 1015 1016 https_request = http_request 1017 https_response = http_response 1018 1019 1020 1021class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 1022 1023 auth_header = 'Authorization' 1024 1025 def http_error_401(self, req, fp, code, msg, headers): 1026 url = req.full_url 1027 response = self.http_error_auth_reqed('www-authenticate', 1028 url, req, headers) 1029 return response 1030 1031 1032class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 1033 1034 auth_header = 'Proxy-authorization' 1035 1036 def http_error_407(self, req, fp, code, msg, headers): 1037 # http_error_auth_reqed requires that there is no userinfo component in 1038 # authority. Assume there isn't one, since urllib.request does not (and 1039 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing 1040 # userinfo. 1041 authority = req.host 1042 response = self.http_error_auth_reqed('proxy-authenticate', 1043 authority, req, headers) 1044 return response 1045 1046 1047# Return n random bytes. 1048_randombytes = os.urandom 1049 1050 1051class AbstractDigestAuthHandler: 1052 # Digest authentication is specified in RFC 2617. 1053 1054 # XXX The client does not inspect the Authentication-Info header 1055 # in a successful response. 1056 1057 # XXX It should be possible to test this implementation against 1058 # a mock server that just generates a static set of challenges. 1059 1060 # XXX qop="auth-int" supports is shaky 1061 1062 def __init__(self, passwd=None): 1063 if passwd is None: 1064 passwd = HTTPPasswordMgr() 1065 self.passwd = passwd 1066 self.add_password = self.passwd.add_password 1067 self.retried = 0 1068 self.nonce_count = 0 1069 self.last_nonce = None 1070 1071 def reset_retry_count(self): 1072 self.retried = 0 1073 1074 def http_error_auth_reqed(self, auth_header, host, req, headers): 1075 authreq = headers.get(auth_header, None) 1076 if self.retried > 5: 1077 # Don't fail endlessly - if we failed once, we'll probably 1078 # fail a second time. Hm. Unless the Password Manager is 1079 # prompting for the information. Crap. This isn't great 1080 # but it's better than the current 'repeat until recursion 1081 # depth exceeded' approach <wink> 1082 raise HTTPError(req.full_url, 401, "digest auth failed", 1083 headers, None) 1084 else: 1085 self.retried += 1 1086 if authreq: 1087 scheme = authreq.split()[0] 1088 if scheme.lower() == 'digest': 1089 return self.retry_http_digest_auth(req, authreq) 1090 elif scheme.lower() != 'basic': 1091 raise ValueError("AbstractDigestAuthHandler does not support" 1092 " the following scheme: '%s'" % scheme) 1093 1094 def retry_http_digest_auth(self, req, auth): 1095 token, challenge = auth.split(' ', 1) 1096 chal = parse_keqv_list(filter(None, parse_http_list(challenge))) 1097 auth = self.get_authorization(req, chal) 1098 if auth: 1099 auth_val = 'Digest %s' % auth 1100 if req.headers.get(self.auth_header, None) == auth_val: 1101 return None 1102 req.add_unredirected_header(self.auth_header, auth_val) 1103 resp = self.parent.open(req, timeout=req.timeout) 1104 return resp 1105 1106 def get_cnonce(self, nonce): 1107 # The cnonce-value is an opaque 1108 # quoted string value provided by the client and used by both client 1109 # and server to avoid chosen plaintext attacks, to provide mutual 1110 # authentication, and to provide some message integrity protection. 1111 # This isn't a fabulous effort, but it's probably Good Enough. 1112 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) 1113 b = s.encode("ascii") + _randombytes(8) 1114 dig = hashlib.sha1(b).hexdigest() 1115 return dig[:16] 1116 1117 def get_authorization(self, req, chal): 1118 try: 1119 realm = chal['realm'] 1120 nonce = chal['nonce'] 1121 qop = chal.get('qop') 1122 algorithm = chal.get('algorithm', 'MD5') 1123 # mod_digest doesn't send an opaque, even though it isn't 1124 # supposed to be optional 1125 opaque = chal.get('opaque', None) 1126 except KeyError: 1127 return None 1128 1129 H, KD = self.get_algorithm_impls(algorithm) 1130 if H is None: 1131 return None 1132 1133 user, pw = self.passwd.find_user_password(realm, req.full_url) 1134 if user is None: 1135 return None 1136 1137 # XXX not implemented yet 1138 if req.data is not None: 1139 entdig = self.get_entity_digest(req.data, chal) 1140 else: 1141 entdig = None 1142 1143 A1 = "%s:%s:%s" % (user, realm, pw) 1144 A2 = "%s:%s" % (req.get_method(), 1145 # XXX selector: what about proxies and full urls 1146 req.selector) 1147 # NOTE: As per RFC 2617, when server sends "auth,auth-int", the client could use either `auth` 1148 # or `auth-int` to the response back. we use `auth` to send the response back. 1149 if qop is None: 1150 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 1151 elif 'auth' in qop.split(','): 1152 if nonce == self.last_nonce: 1153 self.nonce_count += 1 1154 else: 1155 self.nonce_count = 1 1156 self.last_nonce = nonce 1157 ncvalue = '%08x' % self.nonce_count 1158 cnonce = self.get_cnonce(nonce) 1159 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2)) 1160 respdig = KD(H(A1), noncebit) 1161 else: 1162 # XXX handle auth-int. 1163 raise URLError("qop '%s' is not supported." % qop) 1164 1165 # XXX should the partial digests be encoded too? 1166 1167 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 1168 'response="%s"' % (user, realm, nonce, req.selector, 1169 respdig) 1170 if opaque: 1171 base += ', opaque="%s"' % opaque 1172 if entdig: 1173 base += ', digest="%s"' % entdig 1174 base += ', algorithm="%s"' % algorithm 1175 if qop: 1176 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 1177 return base 1178 1179 def get_algorithm_impls(self, algorithm): 1180 # lambdas assume digest modules are imported at the top level 1181 if algorithm == 'MD5': 1182 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() 1183 elif algorithm == 'SHA': 1184 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() 1185 # XXX MD5-sess 1186 else: 1187 raise ValueError("Unsupported digest authentication " 1188 "algorithm %r" % algorithm) 1189 KD = lambda s, d: H("%s:%s" % (s, d)) 1190 return H, KD 1191 1192 def get_entity_digest(self, data, chal): 1193 # XXX not implemented yet 1194 return None 1195 1196 1197class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1198 """An authentication protocol defined by RFC 2069 1199 1200 Digest authentication improves on basic authentication because it 1201 does not transmit passwords in the clear. 1202 """ 1203 1204 auth_header = 'Authorization' 1205 handler_order = 490 # before Basic auth 1206 1207 def http_error_401(self, req, fp, code, msg, headers): 1208 host = urlparse(req.full_url)[1] 1209 retry = self.http_error_auth_reqed('www-authenticate', 1210 host, req, headers) 1211 self.reset_retry_count() 1212 return retry 1213 1214 1215class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1216 1217 auth_header = 'Proxy-Authorization' 1218 handler_order = 490 # before Basic auth 1219 1220 def http_error_407(self, req, fp, code, msg, headers): 1221 host = req.host 1222 retry = self.http_error_auth_reqed('proxy-authenticate', 1223 host, req, headers) 1224 self.reset_retry_count() 1225 return retry 1226 1227class AbstractHTTPHandler(BaseHandler): 1228 1229 def __init__(self, debuglevel=None): 1230 self._debuglevel = debuglevel if debuglevel is not None else http.client.HTTPConnection.debuglevel 1231 1232 def set_http_debuglevel(self, level): 1233 self._debuglevel = level 1234 1235 def _get_content_length(self, request): 1236 return http.client.HTTPConnection._get_content_length( 1237 request.data, 1238 request.get_method()) 1239 1240 def do_request_(self, request): 1241 host = request.host 1242 if not host: 1243 raise URLError('no host given') 1244 1245 if request.data is not None: # POST 1246 data = request.data 1247 if isinstance(data, str): 1248 msg = "POST data should be bytes, an iterable of bytes, " \ 1249 "or a file object. It cannot be of type str." 1250 raise TypeError(msg) 1251 if not request.has_header('Content-type'): 1252 request.add_unredirected_header( 1253 'Content-type', 1254 'application/x-www-form-urlencoded') 1255 if (not request.has_header('Content-length') 1256 and not request.has_header('Transfer-encoding')): 1257 content_length = self._get_content_length(request) 1258 if content_length is not None: 1259 request.add_unredirected_header( 1260 'Content-length', str(content_length)) 1261 else: 1262 request.add_unredirected_header( 1263 'Transfer-encoding', 'chunked') 1264 1265 sel_host = host 1266 if request.has_proxy(): 1267 scheme, sel = _splittype(request.selector) 1268 sel_host, sel_path = _splithost(sel) 1269 if not request.has_header('Host'): 1270 request.add_unredirected_header('Host', sel_host) 1271 for name, value in self.parent.addheaders: 1272 name = name.capitalize() 1273 if not request.has_header(name): 1274 request.add_unredirected_header(name, value) 1275 1276 return request 1277 1278 def do_open(self, http_class, req, **http_conn_args): 1279 """Return an HTTPResponse object for the request, using http_class. 1280 1281 http_class must implement the HTTPConnection API from http.client. 1282 """ 1283 host = req.host 1284 if not host: 1285 raise URLError('no host given') 1286 1287 # will parse host:port 1288 h = http_class(host, timeout=req.timeout, **http_conn_args) 1289 h.set_debuglevel(self._debuglevel) 1290 1291 headers = dict(req.unredirected_hdrs) 1292 headers.update({k: v for k, v in req.headers.items() 1293 if k not in headers}) 1294 1295 # TODO(jhylton): Should this be redesigned to handle 1296 # persistent connections? 1297 1298 # We want to make an HTTP/1.1 request, but the addinfourl 1299 # class isn't prepared to deal with a persistent connection. 1300 # It will try to read all remaining data from the socket, 1301 # which will block while the server waits for the next request. 1302 # So make sure the connection gets closed after the (only) 1303 # request. 1304 headers["Connection"] = "close" 1305 headers = {name.title(): val for name, val in headers.items()} 1306 1307 if req._tunnel_host: 1308 tunnel_headers = {} 1309 proxy_auth_hdr = "Proxy-Authorization" 1310 if proxy_auth_hdr in headers: 1311 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] 1312 # Proxy-Authorization should not be sent to origin 1313 # server. 1314 del headers[proxy_auth_hdr] 1315 h.set_tunnel(req._tunnel_host, headers=tunnel_headers) 1316 1317 try: 1318 try: 1319 h.request(req.get_method(), req.selector, req.data, headers, 1320 encode_chunked=req.has_header('Transfer-encoding')) 1321 except OSError as err: # timeout error 1322 raise URLError(err) 1323 r = h.getresponse() 1324 except: 1325 h.close() 1326 raise 1327 1328 # If the server does not send us a 'Connection: close' header, 1329 # HTTPConnection assumes the socket should be left open. Manually 1330 # mark the socket to be closed when this response object goes away. 1331 if h.sock: 1332 h.sock.close() 1333 h.sock = None 1334 1335 r.url = req.get_full_url() 1336 # This line replaces the .msg attribute of the HTTPResponse 1337 # with .headers, because urllib clients expect the response to 1338 # have the reason in .msg. It would be good to mark this 1339 # attribute is deprecated and get then to use info() or 1340 # .headers. 1341 r.msg = r.reason 1342 return r 1343 1344 1345class HTTPHandler(AbstractHTTPHandler): 1346 1347 def http_open(self, req): 1348 return self.do_open(http.client.HTTPConnection, req) 1349 1350 http_request = AbstractHTTPHandler.do_request_ 1351 1352if hasattr(http.client, 'HTTPSConnection'): 1353 1354 class HTTPSHandler(AbstractHTTPHandler): 1355 1356 def __init__(self, debuglevel=None, context=None, check_hostname=None): 1357 debuglevel = debuglevel if debuglevel is not None else http.client.HTTPSConnection.debuglevel 1358 AbstractHTTPHandler.__init__(self, debuglevel) 1359 if context is None: 1360 http_version = http.client.HTTPSConnection._http_vsn 1361 context = http.client._create_https_context(http_version) 1362 if check_hostname is not None: 1363 context.check_hostname = check_hostname 1364 self._context = context 1365 1366 def https_open(self, req): 1367 return self.do_open(http.client.HTTPSConnection, req, 1368 context=self._context) 1369 1370 https_request = AbstractHTTPHandler.do_request_ 1371 1372 __all__.append('HTTPSHandler') 1373 1374class HTTPCookieProcessor(BaseHandler): 1375 def __init__(self, cookiejar=None): 1376 import http.cookiejar 1377 if cookiejar is None: 1378 cookiejar = http.cookiejar.CookieJar() 1379 self.cookiejar = cookiejar 1380 1381 def http_request(self, request): 1382 self.cookiejar.add_cookie_header(request) 1383 return request 1384 1385 def http_response(self, request, response): 1386 self.cookiejar.extract_cookies(response, request) 1387 return response 1388 1389 https_request = http_request 1390 https_response = http_response 1391 1392class UnknownHandler(BaseHandler): 1393 def unknown_open(self, req): 1394 type = req.type 1395 raise URLError('unknown url type: %s' % type) 1396 1397def parse_keqv_list(l): 1398 """Parse list of key=value strings where keys are not duplicated.""" 1399 parsed = {} 1400 for elt in l: 1401 k, v = elt.split('=', 1) 1402 if v[0] == '"' and v[-1] == '"': 1403 v = v[1:-1] 1404 parsed[k] = v 1405 return parsed 1406 1407def parse_http_list(s): 1408 """Parse lists as described by RFC 2068 Section 2. 1409 1410 In particular, parse comma-separated lists where the elements of 1411 the list may include quoted-strings. A quoted-string could 1412 contain a comma. A non-quoted string could have quotes in the 1413 middle. Neither commas nor quotes count if they are escaped. 1414 Only double-quotes count, not single-quotes. 1415 """ 1416 res = [] 1417 part = '' 1418 1419 escape = quote = False 1420 for cur in s: 1421 if escape: 1422 part += cur 1423 escape = False 1424 continue 1425 if quote: 1426 if cur == '\\': 1427 escape = True 1428 continue 1429 elif cur == '"': 1430 quote = False 1431 part += cur 1432 continue 1433 1434 if cur == ',': 1435 res.append(part) 1436 part = '' 1437 continue 1438 1439 if cur == '"': 1440 quote = True 1441 1442 part += cur 1443 1444 # append last part 1445 if part: 1446 res.append(part) 1447 1448 return [part.strip() for part in res] 1449 1450class FileHandler(BaseHandler): 1451 # Use local file or FTP depending on form of URL 1452 def file_open(self, req): 1453 url = req.selector 1454 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1455 req.host != 'localhost'): 1456 if not req.host in self.get_names(): 1457 raise URLError("file:// scheme is supported only on localhost") 1458 else: 1459 return self.open_local_file(req) 1460 1461 # names for the localhost 1462 names = None 1463 def get_names(self): 1464 if FileHandler.names is None: 1465 try: 1466 FileHandler.names = tuple( 1467 socket.gethostbyname_ex('localhost')[2] + 1468 socket.gethostbyname_ex(socket.gethostname())[2]) 1469 except socket.gaierror: 1470 FileHandler.names = (socket.gethostbyname('localhost'),) 1471 return FileHandler.names 1472 1473 # not entirely sure what the rules are here 1474 def open_local_file(self, req): 1475 import email.utils 1476 import mimetypes 1477 host = req.host 1478 filename = req.selector 1479 localfile = url2pathname(filename) 1480 try: 1481 stats = os.stat(localfile) 1482 size = stats.st_size 1483 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1484 mtype = mimetypes.guess_type(filename)[0] 1485 headers = email.message_from_string( 1486 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1487 (mtype or 'text/plain', size, modified)) 1488 if host: 1489 host, port = _splitport(host) 1490 if not host or \ 1491 (not port and _safe_gethostbyname(host) in self.get_names()): 1492 if host: 1493 origurl = 'file://' + host + filename 1494 else: 1495 origurl = 'file://' + filename 1496 return addinfourl(open(localfile, 'rb'), headers, origurl) 1497 except OSError as exp: 1498 raise URLError(exp) 1499 raise URLError('file not on local host') 1500 1501def _safe_gethostbyname(host): 1502 try: 1503 return socket.gethostbyname(host) 1504 except socket.gaierror: 1505 return None 1506 1507class FTPHandler(BaseHandler): 1508 def ftp_open(self, req): 1509 import ftplib 1510 import mimetypes 1511 host = req.host 1512 if not host: 1513 raise URLError('ftp error: no host given') 1514 host, port = _splitport(host) 1515 if port is None: 1516 port = ftplib.FTP_PORT 1517 else: 1518 port = int(port) 1519 1520 # username/password handling 1521 user, host = _splituser(host) 1522 if user: 1523 user, passwd = _splitpasswd(user) 1524 else: 1525 passwd = None 1526 host = unquote(host) 1527 user = user or '' 1528 passwd = passwd or '' 1529 1530 try: 1531 host = socket.gethostbyname(host) 1532 except OSError as msg: 1533 raise URLError(msg) 1534 path, attrs = _splitattr(req.selector) 1535 dirs = path.split('/') 1536 dirs = list(map(unquote, dirs)) 1537 dirs, file = dirs[:-1], dirs[-1] 1538 if dirs and not dirs[0]: 1539 dirs = dirs[1:] 1540 try: 1541 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) 1542 type = file and 'I' or 'D' 1543 for attr in attrs: 1544 attr, value = _splitvalue(attr) 1545 if attr.lower() == 'type' and \ 1546 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1547 type = value.upper() 1548 fp, retrlen = fw.retrfile(file, type) 1549 headers = "" 1550 mtype = mimetypes.guess_type(req.full_url)[0] 1551 if mtype: 1552 headers += "Content-type: %s\n" % mtype 1553 if retrlen is not None and retrlen >= 0: 1554 headers += "Content-length: %d\n" % retrlen 1555 headers = email.message_from_string(headers) 1556 return addinfourl(fp, headers, req.full_url) 1557 except ftplib.all_errors as exp: 1558 raise URLError(exp) from exp 1559 1560 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1561 return ftpwrapper(user, passwd, host, port, dirs, timeout, 1562 persistent=False) 1563 1564class CacheFTPHandler(FTPHandler): 1565 # XXX would be nice to have pluggable cache strategies 1566 # XXX this stuff is definitely not thread safe 1567 def __init__(self): 1568 self.cache = {} 1569 self.timeout = {} 1570 self.soonest = 0 1571 self.delay = 60 1572 self.max_conns = 16 1573 1574 def setTimeout(self, t): 1575 self.delay = t 1576 1577 def setMaxConns(self, m): 1578 self.max_conns = m 1579 1580 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1581 key = user, host, port, '/'.join(dirs), timeout 1582 if key in self.cache: 1583 self.timeout[key] = time.time() + self.delay 1584 else: 1585 self.cache[key] = ftpwrapper(user, passwd, host, port, 1586 dirs, timeout) 1587 self.timeout[key] = time.time() + self.delay 1588 self.check_cache() 1589 return self.cache[key] 1590 1591 def check_cache(self): 1592 # first check for old ones 1593 t = time.time() 1594 if self.soonest <= t: 1595 for k, v in list(self.timeout.items()): 1596 if v < t: 1597 self.cache[k].close() 1598 del self.cache[k] 1599 del self.timeout[k] 1600 self.soonest = min(list(self.timeout.values())) 1601 1602 # then check the size 1603 if len(self.cache) == self.max_conns: 1604 for k, v in list(self.timeout.items()): 1605 if v == self.soonest: 1606 del self.cache[k] 1607 del self.timeout[k] 1608 break 1609 self.soonest = min(list(self.timeout.values())) 1610 1611 def clear_cache(self): 1612 for conn in self.cache.values(): 1613 conn.close() 1614 self.cache.clear() 1615 self.timeout.clear() 1616 1617class DataHandler(BaseHandler): 1618 def data_open(self, req): 1619 # data URLs as specified in RFC 2397. 1620 # 1621 # ignores POSTed data 1622 # 1623 # syntax: 1624 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 1625 # mediatype := [ type "/" subtype ] *( ";" parameter ) 1626 # data := *urlchar 1627 # parameter := attribute "=" value 1628 url = req.full_url 1629 1630 scheme, data = url.split(":",1) 1631 mediatype, data = data.split(",",1) 1632 1633 # even base64 encoded data URLs might be quoted so unquote in any case: 1634 data = unquote_to_bytes(data) 1635 if mediatype.endswith(";base64"): 1636 data = base64.decodebytes(data) 1637 mediatype = mediatype[:-7] 1638 1639 if not mediatype: 1640 mediatype = "text/plain;charset=US-ASCII" 1641 1642 headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" % 1643 (mediatype, len(data))) 1644 1645 return addinfourl(io.BytesIO(data), headers, url) 1646 1647 1648# Code move from the old urllib module 1649 1650MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 1651 1652# Helper for non-unix systems 1653if os.name == 'nt': 1654 from nturl2path import url2pathname, pathname2url 1655else: 1656 def url2pathname(pathname): 1657 """OS-specific conversion from a relative URL of the 'file' scheme 1658 to a file system path; not recommended for general use.""" 1659 if pathname[:3] == '///': 1660 # URL has an empty authority section, so the path begins on the 1661 # third character. 1662 pathname = pathname[2:] 1663 elif pathname[:12] == '//localhost/': 1664 # Skip past 'localhost' authority. 1665 pathname = pathname[11:] 1666 encoding = sys.getfilesystemencoding() 1667 errors = sys.getfilesystemencodeerrors() 1668 return unquote(pathname, encoding=encoding, errors=errors) 1669 1670 def pathname2url(pathname): 1671 """OS-specific conversion from a file system path to a relative URL 1672 of the 'file' scheme; not recommended for general use.""" 1673 if pathname[:2] == '//': 1674 # Add explicitly empty authority to avoid interpreting the path 1675 # as authority. 1676 pathname = '//' + pathname 1677 encoding = sys.getfilesystemencoding() 1678 errors = sys.getfilesystemencodeerrors() 1679 return quote(pathname, encoding=encoding, errors=errors) 1680 1681 1682ftpcache = {} 1683 1684 1685class URLopener: 1686 """Class to open URLs. 1687 This is a class rather than just a subroutine because we may need 1688 more than one set of global protocol-specific options. 1689 Note -- this is a base class for those who don't want the 1690 automatic handling of errors type 302 (relocated) and 401 1691 (authorization needed).""" 1692 1693 __tempfiles = None 1694 1695 version = "Python-urllib/%s" % __version__ 1696 1697 # Constructor 1698 def __init__(self, proxies=None, **x509): 1699 msg = "%(class)s style of invoking requests is deprecated. " \ 1700 "Use newer urlopen functions/methods" % {'class': self.__class__.__name__} 1701 warnings.warn(msg, DeprecationWarning, stacklevel=3) 1702 if proxies is None: 1703 proxies = getproxies() 1704 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 1705 self.proxies = proxies 1706 self.key_file = x509.get('key_file') 1707 self.cert_file = x509.get('cert_file') 1708 self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')] 1709 self.__tempfiles = [] 1710 self.__unlink = os.unlink # See cleanup() 1711 self.tempcache = None 1712 # Undocumented feature: if you assign {} to tempcache, 1713 # it is used to cache files retrieved with 1714 # self.retrieve(). This is not enabled by default 1715 # since it does not work for changing documents (and I 1716 # haven't got the logic to check expiration headers 1717 # yet). 1718 self.ftpcache = ftpcache 1719 # Undocumented feature: you can use a different 1720 # ftp cache by assigning to the .ftpcache member; 1721 # in case you want logically independent URL openers 1722 # XXX This is not threadsafe. Bah. 1723 1724 def __del__(self): 1725 self.close() 1726 1727 def close(self): 1728 self.cleanup() 1729 1730 def cleanup(self): 1731 # This code sometimes runs when the rest of this module 1732 # has already been deleted, so it can't use any globals 1733 # or import anything. 1734 if self.__tempfiles: 1735 for file in self.__tempfiles: 1736 try: 1737 self.__unlink(file) 1738 except OSError: 1739 pass 1740 del self.__tempfiles[:] 1741 if self.tempcache: 1742 self.tempcache.clear() 1743 1744 def addheader(self, *args): 1745 """Add a header to be used by the HTTP interface only 1746 e.g. u.addheader('Accept', 'sound/basic')""" 1747 self.addheaders.append(args) 1748 1749 # External interface 1750 def open(self, fullurl, data=None): 1751 """Use URLopener().open(file) instead of open(file, 'r').""" 1752 fullurl = unwrap(_to_bytes(fullurl)) 1753 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 1754 if self.tempcache and fullurl in self.tempcache: 1755 filename, headers = self.tempcache[fullurl] 1756 fp = open(filename, 'rb') 1757 return addinfourl(fp, headers, fullurl) 1758 urltype, url = _splittype(fullurl) 1759 if not urltype: 1760 urltype = 'file' 1761 if urltype in self.proxies: 1762 proxy = self.proxies[urltype] 1763 urltype, proxyhost = _splittype(proxy) 1764 host, selector = _splithost(proxyhost) 1765 url = (host, fullurl) # Signal special case to open_*() 1766 else: 1767 proxy = None 1768 name = 'open_' + urltype 1769 self.type = urltype 1770 name = name.replace('-', '_') 1771 if not hasattr(self, name) or name == 'open_local_file': 1772 if proxy: 1773 return self.open_unknown_proxy(proxy, fullurl, data) 1774 else: 1775 return self.open_unknown(fullurl, data) 1776 try: 1777 if data is None: 1778 return getattr(self, name)(url) 1779 else: 1780 return getattr(self, name)(url, data) 1781 except (HTTPError, URLError): 1782 raise 1783 except OSError as msg: 1784 raise OSError('socket error', msg) from msg 1785 1786 def open_unknown(self, fullurl, data=None): 1787 """Overridable interface to open unknown URL type.""" 1788 type, url = _splittype(fullurl) 1789 raise OSError('url error', 'unknown url type', type) 1790 1791 def open_unknown_proxy(self, proxy, fullurl, data=None): 1792 """Overridable interface to open unknown URL type.""" 1793 type, url = _splittype(fullurl) 1794 raise OSError('url error', 'invalid proxy for %s' % type, proxy) 1795 1796 # External interface 1797 def retrieve(self, url, filename=None, reporthook=None, data=None): 1798 """retrieve(url) returns (filename, headers) for a local object 1799 or (tempfilename, headers) for a remote object.""" 1800 url = unwrap(_to_bytes(url)) 1801 if self.tempcache and url in self.tempcache: 1802 return self.tempcache[url] 1803 type, url1 = _splittype(url) 1804 if filename is None and (not type or type == 'file'): 1805 try: 1806 fp = self.open_local_file(url1) 1807 hdrs = fp.info() 1808 fp.close() 1809 return url2pathname(_splithost(url1)[1]), hdrs 1810 except OSError: 1811 pass 1812 fp = self.open(url, data) 1813 try: 1814 headers = fp.info() 1815 if filename: 1816 tfp = open(filename, 'wb') 1817 else: 1818 garbage, path = _splittype(url) 1819 garbage, path = _splithost(path or "") 1820 path, garbage = _splitquery(path or "") 1821 path, garbage = _splitattr(path or "") 1822 suffix = os.path.splitext(path)[1] 1823 (fd, filename) = tempfile.mkstemp(suffix) 1824 self.__tempfiles.append(filename) 1825 tfp = os.fdopen(fd, 'wb') 1826 try: 1827 result = filename, headers 1828 if self.tempcache is not None: 1829 self.tempcache[url] = result 1830 bs = 1024*8 1831 size = -1 1832 read = 0 1833 blocknum = 0 1834 if "content-length" in headers: 1835 size = int(headers["Content-Length"]) 1836 if reporthook: 1837 reporthook(blocknum, bs, size) 1838 while block := fp.read(bs): 1839 read += len(block) 1840 tfp.write(block) 1841 blocknum += 1 1842 if reporthook: 1843 reporthook(blocknum, bs, size) 1844 finally: 1845 tfp.close() 1846 finally: 1847 fp.close() 1848 1849 # raise exception if actual size does not match content-length header 1850 if size >= 0 and read < size: 1851 raise ContentTooShortError( 1852 "retrieval incomplete: got only %i out of %i bytes" 1853 % (read, size), result) 1854 1855 return result 1856 1857 # Each method named open_<type> knows how to open that type of URL 1858 1859 def _open_generic_http(self, connection_factory, url, data): 1860 """Make an HTTP connection using connection_class. 1861 1862 This is an internal method that should be called from 1863 open_http() or open_https(). 1864 1865 Arguments: 1866 - connection_factory should take a host name and return an 1867 HTTPConnection instance. 1868 - url is the url to retrieval or a host, relative-path pair. 1869 - data is payload for a POST request or None. 1870 """ 1871 1872 user_passwd = None 1873 proxy_passwd= None 1874 if isinstance(url, str): 1875 host, selector = _splithost(url) 1876 if host: 1877 user_passwd, host = _splituser(host) 1878 host = unquote(host) 1879 realhost = host 1880 else: 1881 host, selector = url 1882 # check whether the proxy contains authorization information 1883 proxy_passwd, host = _splituser(host) 1884 # now we proceed with the url we want to obtain 1885 urltype, rest = _splittype(selector) 1886 url = rest 1887 user_passwd = None 1888 if urltype.lower() != 'http': 1889 realhost = None 1890 else: 1891 realhost, rest = _splithost(rest) 1892 if realhost: 1893 user_passwd, realhost = _splituser(realhost) 1894 if user_passwd: 1895 selector = "%s://%s%s" % (urltype, realhost, rest) 1896 if proxy_bypass(realhost): 1897 host = realhost 1898 1899 if not host: raise OSError('http error', 'no host given') 1900 1901 if proxy_passwd: 1902 proxy_passwd = unquote(proxy_passwd) 1903 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii') 1904 else: 1905 proxy_auth = None 1906 1907 if user_passwd: 1908 user_passwd = unquote(user_passwd) 1909 auth = base64.b64encode(user_passwd.encode()).decode('ascii') 1910 else: 1911 auth = None 1912 http_conn = connection_factory(host) 1913 headers = {} 1914 if proxy_auth: 1915 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth 1916 if auth: 1917 headers["Authorization"] = "Basic %s" % auth 1918 if realhost: 1919 headers["Host"] = realhost 1920 1921 # Add Connection:close as we don't support persistent connections yet. 1922 # This helps in closing the socket and avoiding ResourceWarning 1923 1924 headers["Connection"] = "close" 1925 1926 for header, value in self.addheaders: 1927 headers[header] = value 1928 1929 if data is not None: 1930 headers["Content-Type"] = "application/x-www-form-urlencoded" 1931 http_conn.request("POST", selector, data, headers) 1932 else: 1933 http_conn.request("GET", selector, headers=headers) 1934 1935 try: 1936 response = http_conn.getresponse() 1937 except http.client.BadStatusLine: 1938 # something went wrong with the HTTP status line 1939 raise URLError("http protocol error: bad status line") 1940 1941 # According to RFC 2616, "2xx" code indicates that the client's 1942 # request was successfully received, understood, and accepted. 1943 if 200 <= response.status < 300: 1944 return addinfourl(response, response.msg, "http:" + url, 1945 response.status) 1946 else: 1947 return self.http_error( 1948 url, response.fp, 1949 response.status, response.reason, response.msg, data) 1950 1951 def open_http(self, url, data=None): 1952 """Use HTTP protocol.""" 1953 return self._open_generic_http(http.client.HTTPConnection, url, data) 1954 1955 def http_error(self, url, fp, errcode, errmsg, headers, data=None): 1956 """Handle http errors. 1957 1958 Derived class can override this, or provide specific handlers 1959 named http_error_DDD where DDD is the 3-digit error code.""" 1960 # First check if there's a specific handler for this error 1961 name = 'http_error_%d' % errcode 1962 if hasattr(self, name): 1963 method = getattr(self, name) 1964 if data is None: 1965 result = method(url, fp, errcode, errmsg, headers) 1966 else: 1967 result = method(url, fp, errcode, errmsg, headers, data) 1968 if result: return result 1969 return self.http_error_default(url, fp, errcode, errmsg, headers) 1970 1971 def http_error_default(self, url, fp, errcode, errmsg, headers): 1972 """Default error handler: close the connection and raise OSError.""" 1973 fp.close() 1974 raise HTTPError(url, errcode, errmsg, headers, None) 1975 1976 if _have_ssl: 1977 def _https_connection(self, host): 1978 if self.key_file or self.cert_file: 1979 http_version = http.client.HTTPSConnection._http_vsn 1980 context = http.client._create_https_context(http_version) 1981 context.load_cert_chain(self.cert_file, self.key_file) 1982 # cert and key file means the user wants to authenticate. 1983 # enable TLS 1.3 PHA implicitly even for custom contexts. 1984 if context.post_handshake_auth is not None: 1985 context.post_handshake_auth = True 1986 else: 1987 context = None 1988 return http.client.HTTPSConnection(host, context=context) 1989 1990 def open_https(self, url, data=None): 1991 """Use HTTPS protocol.""" 1992 return self._open_generic_http(self._https_connection, url, data) 1993 1994 def open_file(self, url): 1995 """Use local file or FTP depending on form of URL.""" 1996 if not isinstance(url, str): 1997 raise URLError('file error: proxy support for file protocol currently not implemented') 1998 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': 1999 raise ValueError("file:// scheme is supported only on localhost") 2000 else: 2001 return self.open_local_file(url) 2002 2003 def open_local_file(self, url): 2004 """Use local file.""" 2005 import email.utils 2006 import mimetypes 2007 host, file = _splithost(url) 2008 localname = url2pathname(file) 2009 try: 2010 stats = os.stat(localname) 2011 except OSError as e: 2012 raise URLError(e.strerror, e.filename) 2013 size = stats.st_size 2014 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 2015 mtype = mimetypes.guess_type(url)[0] 2016 headers = email.message_from_string( 2017 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % 2018 (mtype or 'text/plain', size, modified)) 2019 if not host: 2020 urlfile = file 2021 if file[:1] == '/': 2022 urlfile = 'file://' + file 2023 return addinfourl(open(localname, 'rb'), headers, urlfile) 2024 host, port = _splitport(host) 2025 if (not port 2026 and socket.gethostbyname(host) in ((localhost(),) + thishost())): 2027 urlfile = file 2028 if file[:1] == '/': 2029 urlfile = 'file://' + file 2030 elif file[:2] == './': 2031 raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url) 2032 return addinfourl(open(localname, 'rb'), headers, urlfile) 2033 raise URLError('local file error: not on local host') 2034 2035 def open_ftp(self, url): 2036 """Use FTP protocol.""" 2037 if not isinstance(url, str): 2038 raise URLError('ftp error: proxy support for ftp protocol currently not implemented') 2039 import mimetypes 2040 host, path = _splithost(url) 2041 if not host: raise URLError('ftp error: no host given') 2042 host, port = _splitport(host) 2043 user, host = _splituser(host) 2044 if user: user, passwd = _splitpasswd(user) 2045 else: passwd = None 2046 host = unquote(host) 2047 user = unquote(user or '') 2048 passwd = unquote(passwd or '') 2049 host = socket.gethostbyname(host) 2050 if not port: 2051 import ftplib 2052 port = ftplib.FTP_PORT 2053 else: 2054 port = int(port) 2055 path, attrs = _splitattr(path) 2056 path = unquote(path) 2057 dirs = path.split('/') 2058 dirs, file = dirs[:-1], dirs[-1] 2059 if dirs and not dirs[0]: dirs = dirs[1:] 2060 if dirs and not dirs[0]: dirs[0] = '/' 2061 key = user, host, port, '/'.join(dirs) 2062 # XXX thread unsafe! 2063 if len(self.ftpcache) > MAXFTPCACHE: 2064 # Prune the cache, rather arbitrarily 2065 for k in list(self.ftpcache): 2066 if k != key: 2067 v = self.ftpcache[k] 2068 del self.ftpcache[k] 2069 v.close() 2070 try: 2071 if key not in self.ftpcache: 2072 self.ftpcache[key] = \ 2073 ftpwrapper(user, passwd, host, port, dirs) 2074 if not file: type = 'D' 2075 else: type = 'I' 2076 for attr in attrs: 2077 attr, value = _splitvalue(attr) 2078 if attr.lower() == 'type' and \ 2079 value in ('a', 'A', 'i', 'I', 'd', 'D'): 2080 type = value.upper() 2081 (fp, retrlen) = self.ftpcache[key].retrfile(file, type) 2082 mtype = mimetypes.guess_type("ftp:" + url)[0] 2083 headers = "" 2084 if mtype: 2085 headers += "Content-Type: %s\n" % mtype 2086 if retrlen is not None and retrlen >= 0: 2087 headers += "Content-Length: %d\n" % retrlen 2088 headers = email.message_from_string(headers) 2089 return addinfourl(fp, headers, "ftp:" + url) 2090 except ftperrors() as exp: 2091 raise URLError(f'ftp error: {exp}') from exp 2092 2093 def open_data(self, url, data=None): 2094 """Use "data" URL.""" 2095 if not isinstance(url, str): 2096 raise URLError('data error: proxy support for data protocol currently not implemented') 2097 # ignore POSTed data 2098 # 2099 # syntax of data URLs: 2100 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 2101 # mediatype := [ type "/" subtype ] *( ";" parameter ) 2102 # data := *urlchar 2103 # parameter := attribute "=" value 2104 try: 2105 [type, data] = url.split(',', 1) 2106 except ValueError: 2107 raise OSError('data error', 'bad data URL') 2108 if not type: 2109 type = 'text/plain;charset=US-ASCII' 2110 semi = type.rfind(';') 2111 if semi >= 0 and '=' not in type[semi:]: 2112 encoding = type[semi+1:] 2113 type = type[:semi] 2114 else: 2115 encoding = '' 2116 msg = [] 2117 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT', 2118 time.gmtime(time.time()))) 2119 msg.append('Content-type: %s' % type) 2120 if encoding == 'base64': 2121 # XXX is this encoding/decoding ok? 2122 data = base64.decodebytes(data.encode('ascii')).decode('latin-1') 2123 else: 2124 data = unquote(data) 2125 msg.append('Content-Length: %d' % len(data)) 2126 msg.append('') 2127 msg.append(data) 2128 msg = '\n'.join(msg) 2129 headers = email.message_from_string(msg) 2130 f = io.StringIO(msg) 2131 #f.fileno = None # needed for addinfourl 2132 return addinfourl(f, headers, url) 2133 2134 2135class FancyURLopener(URLopener): 2136 """Derived class with handlers for errors we can handle (perhaps).""" 2137 2138 def __init__(self, *args, **kwargs): 2139 URLopener.__init__(self, *args, **kwargs) 2140 self.auth_cache = {} 2141 self.tries = 0 2142 self.maxtries = 10 2143 2144 def http_error_default(self, url, fp, errcode, errmsg, headers): 2145 """Default error handling -- don't raise an exception.""" 2146 return addinfourl(fp, headers, "http:" + url, errcode) 2147 2148 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): 2149 """Error 302 -- relocated (temporarily).""" 2150 self.tries += 1 2151 try: 2152 if self.maxtries and self.tries >= self.maxtries: 2153 if hasattr(self, "http_error_500"): 2154 meth = self.http_error_500 2155 else: 2156 meth = self.http_error_default 2157 return meth(url, fp, 500, 2158 "Internal Server Error: Redirect Recursion", 2159 headers) 2160 result = self.redirect_internal(url, fp, errcode, errmsg, 2161 headers, data) 2162 return result 2163 finally: 2164 self.tries = 0 2165 2166 def redirect_internal(self, url, fp, errcode, errmsg, headers, data): 2167 if 'location' in headers: 2168 newurl = headers['location'] 2169 elif 'uri' in headers: 2170 newurl = headers['uri'] 2171 else: 2172 return 2173 fp.close() 2174 2175 # In case the server sent a relative URL, join with original: 2176 newurl = urljoin(self.type + ":" + url, newurl) 2177 2178 urlparts = urlparse(newurl) 2179 2180 # For security reasons, we don't allow redirection to anything other 2181 # than http, https and ftp. 2182 2183 # We are using newer HTTPError with older redirect_internal method 2184 # This older method will get deprecated in 3.3 2185 2186 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 2187 raise HTTPError(newurl, errcode, 2188 errmsg + 2189 " Redirection to url '%s' is not allowed." % newurl, 2190 headers, fp) 2191 2192 return self.open(newurl) 2193 2194 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 2195 """Error 301 -- also relocated (permanently).""" 2196 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2197 2198 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): 2199 """Error 303 -- also relocated (essentially identical to 302).""" 2200 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2201 2202 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): 2203 """Error 307 -- relocated, but turn POST into error.""" 2204 if data is None: 2205 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2206 else: 2207 return self.http_error_default(url, fp, errcode, errmsg, headers) 2208 2209 def http_error_308(self, url, fp, errcode, errmsg, headers, data=None): 2210 """Error 308 -- relocated, but turn POST into error.""" 2211 if data is None: 2212 return self.http_error_301(url, fp, errcode, errmsg, headers, data) 2213 else: 2214 return self.http_error_default(url, fp, errcode, errmsg, headers) 2215 2216 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None, 2217 retry=False): 2218 """Error 401 -- authentication required. 2219 This function supports Basic authentication only.""" 2220 if 'www-authenticate' not in headers: 2221 URLopener.http_error_default(self, url, fp, 2222 errcode, errmsg, headers) 2223 stuff = headers['www-authenticate'] 2224 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 2225 if not match: 2226 URLopener.http_error_default(self, url, fp, 2227 errcode, errmsg, headers) 2228 scheme, realm = match.groups() 2229 if scheme.lower() != 'basic': 2230 URLopener.http_error_default(self, url, fp, 2231 errcode, errmsg, headers) 2232 if not retry: 2233 URLopener.http_error_default(self, url, fp, errcode, errmsg, 2234 headers) 2235 name = 'retry_' + self.type + '_basic_auth' 2236 if data is None: 2237 return getattr(self,name)(url, realm) 2238 else: 2239 return getattr(self,name)(url, realm, data) 2240 2241 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None, 2242 retry=False): 2243 """Error 407 -- proxy authentication required. 2244 This function supports Basic authentication only.""" 2245 if 'proxy-authenticate' not in headers: 2246 URLopener.http_error_default(self, url, fp, 2247 errcode, errmsg, headers) 2248 stuff = headers['proxy-authenticate'] 2249 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) 2250 if not match: 2251 URLopener.http_error_default(self, url, fp, 2252 errcode, errmsg, headers) 2253 scheme, realm = match.groups() 2254 if scheme.lower() != 'basic': 2255 URLopener.http_error_default(self, url, fp, 2256 errcode, errmsg, headers) 2257 if not retry: 2258 URLopener.http_error_default(self, url, fp, errcode, errmsg, 2259 headers) 2260 name = 'retry_proxy_' + self.type + '_basic_auth' 2261 if data is None: 2262 return getattr(self,name)(url, realm) 2263 else: 2264 return getattr(self,name)(url, realm, data) 2265 2266 def retry_proxy_http_basic_auth(self, url, realm, data=None): 2267 host, selector = _splithost(url) 2268 newurl = 'http://' + host + selector 2269 proxy = self.proxies['http'] 2270 urltype, proxyhost = _splittype(proxy) 2271 proxyhost, proxyselector = _splithost(proxyhost) 2272 i = proxyhost.find('@') + 1 2273 proxyhost = proxyhost[i:] 2274 user, passwd = self.get_user_passwd(proxyhost, realm, i) 2275 if not (user or passwd): return None 2276 proxyhost = "%s:%s@%s" % (quote(user, safe=''), 2277 quote(passwd, safe=''), proxyhost) 2278 self.proxies['http'] = 'http://' + proxyhost + proxyselector 2279 if data is None: 2280 return self.open(newurl) 2281 else: 2282 return self.open(newurl, data) 2283 2284 def retry_proxy_https_basic_auth(self, url, realm, data=None): 2285 host, selector = _splithost(url) 2286 newurl = 'https://' + host + selector 2287 proxy = self.proxies['https'] 2288 urltype, proxyhost = _splittype(proxy) 2289 proxyhost, proxyselector = _splithost(proxyhost) 2290 i = proxyhost.find('@') + 1 2291 proxyhost = proxyhost[i:] 2292 user, passwd = self.get_user_passwd(proxyhost, realm, i) 2293 if not (user or passwd): return None 2294 proxyhost = "%s:%s@%s" % (quote(user, safe=''), 2295 quote(passwd, safe=''), proxyhost) 2296 self.proxies['https'] = 'https://' + proxyhost + proxyselector 2297 if data is None: 2298 return self.open(newurl) 2299 else: 2300 return self.open(newurl, data) 2301 2302 def retry_http_basic_auth(self, url, realm, data=None): 2303 host, selector = _splithost(url) 2304 i = host.find('@') + 1 2305 host = host[i:] 2306 user, passwd = self.get_user_passwd(host, realm, i) 2307 if not (user or passwd): return None 2308 host = "%s:%s@%s" % (quote(user, safe=''), 2309 quote(passwd, safe=''), host) 2310 newurl = 'http://' + host + selector 2311 if data is None: 2312 return self.open(newurl) 2313 else: 2314 return self.open(newurl, data) 2315 2316 def retry_https_basic_auth(self, url, realm, data=None): 2317 host, selector = _splithost(url) 2318 i = host.find('@') + 1 2319 host = host[i:] 2320 user, passwd = self.get_user_passwd(host, realm, i) 2321 if not (user or passwd): return None 2322 host = "%s:%s@%s" % (quote(user, safe=''), 2323 quote(passwd, safe=''), host) 2324 newurl = 'https://' + host + selector 2325 if data is None: 2326 return self.open(newurl) 2327 else: 2328 return self.open(newurl, data) 2329 2330 def get_user_passwd(self, host, realm, clear_cache=0): 2331 key = realm + '@' + host.lower() 2332 if key in self.auth_cache: 2333 if clear_cache: 2334 del self.auth_cache[key] 2335 else: 2336 return self.auth_cache[key] 2337 user, passwd = self.prompt_user_passwd(host, realm) 2338 if user or passwd: self.auth_cache[key] = (user, passwd) 2339 return user, passwd 2340 2341 def prompt_user_passwd(self, host, realm): 2342 """Override this in a GUI environment!""" 2343 import getpass 2344 try: 2345 user = input("Enter username for %s at %s: " % (realm, host)) 2346 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 2347 (user, realm, host)) 2348 return user, passwd 2349 except KeyboardInterrupt: 2350 print() 2351 return None, None 2352 2353 2354# Utility functions 2355 2356_localhost = None 2357def localhost(): 2358 """Return the IP address of the magic hostname 'localhost'.""" 2359 global _localhost 2360 if _localhost is None: 2361 _localhost = socket.gethostbyname('localhost') 2362 return _localhost 2363 2364_thishost = None 2365def thishost(): 2366 """Return the IP addresses of the current host.""" 2367 global _thishost 2368 if _thishost is None: 2369 try: 2370 _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2]) 2371 except socket.gaierror: 2372 _thishost = tuple(socket.gethostbyname_ex('localhost')[2]) 2373 return _thishost 2374 2375_ftperrors = None 2376def ftperrors(): 2377 """Return the set of errors raised by the FTP class.""" 2378 global _ftperrors 2379 if _ftperrors is None: 2380 import ftplib 2381 _ftperrors = ftplib.all_errors 2382 return _ftperrors 2383 2384_noheaders = None 2385def noheaders(): 2386 """Return an empty email Message object.""" 2387 global _noheaders 2388 if _noheaders is None: 2389 _noheaders = email.message_from_string("") 2390 return _noheaders 2391 2392 2393# Utility classes 2394 2395class ftpwrapper: 2396 """Class used by open_ftp() for cache of open FTP connections.""" 2397 2398 def __init__(self, user, passwd, host, port, dirs, timeout=None, 2399 persistent=True): 2400 self.user = user 2401 self.passwd = passwd 2402 self.host = host 2403 self.port = port 2404 self.dirs = dirs 2405 self.timeout = timeout 2406 self.refcount = 0 2407 self.keepalive = persistent 2408 try: 2409 self.init() 2410 except: 2411 self.close() 2412 raise 2413 2414 def init(self): 2415 import ftplib 2416 self.busy = 0 2417 self.ftp = ftplib.FTP() 2418 self.ftp.connect(self.host, self.port, self.timeout) 2419 self.ftp.login(self.user, self.passwd) 2420 _target = '/'.join(self.dirs) 2421 self.ftp.cwd(_target) 2422 2423 def retrfile(self, file, type): 2424 import ftplib 2425 self.endtransfer() 2426 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 2427 else: cmd = 'TYPE ' + type; isdir = 0 2428 try: 2429 self.ftp.voidcmd(cmd) 2430 except ftplib.all_errors: 2431 self.init() 2432 self.ftp.voidcmd(cmd) 2433 conn = None 2434 if file and not isdir: 2435 # Try to retrieve as a file 2436 try: 2437 cmd = 'RETR ' + file 2438 conn, retrlen = self.ftp.ntransfercmd(cmd) 2439 except ftplib.error_perm as reason: 2440 if str(reason)[:3] != '550': 2441 raise URLError(f'ftp error: {reason}') from reason 2442 if not conn: 2443 # Set transfer mode to ASCII! 2444 self.ftp.voidcmd('TYPE A') 2445 # Try a directory listing. Verify that directory exists. 2446 if file: 2447 pwd = self.ftp.pwd() 2448 try: 2449 try: 2450 self.ftp.cwd(file) 2451 except ftplib.error_perm as reason: 2452 raise URLError('ftp error: %r' % reason) from reason 2453 finally: 2454 self.ftp.cwd(pwd) 2455 cmd = 'LIST ' + file 2456 else: 2457 cmd = 'LIST' 2458 conn, retrlen = self.ftp.ntransfercmd(cmd) 2459 self.busy = 1 2460 2461 ftpobj = addclosehook(conn.makefile('rb'), self.file_close) 2462 self.refcount += 1 2463 conn.close() 2464 # Pass back both a suitably decorated object and a retrieval length 2465 return (ftpobj, retrlen) 2466 2467 def endtransfer(self): 2468 if not self.busy: 2469 return 2470 self.busy = 0 2471 try: 2472 self.ftp.voidresp() 2473 except ftperrors(): 2474 pass 2475 2476 def close(self): 2477 self.keepalive = False 2478 if self.refcount <= 0: 2479 self.real_close() 2480 2481 def file_close(self): 2482 self.endtransfer() 2483 self.refcount -= 1 2484 if self.refcount <= 0 and not self.keepalive: 2485 self.real_close() 2486 2487 def real_close(self): 2488 self.endtransfer() 2489 try: 2490 self.ftp.close() 2491 except ftperrors(): 2492 pass 2493 2494# Proxy handling 2495def getproxies_environment(): 2496 """Return a dictionary of scheme -> proxy server URL mappings. 2497 2498 Scan the environment for variables named <scheme>_proxy; 2499 this seems to be the standard convention. If you need a 2500 different way, you can pass a proxies dictionary to the 2501 [Fancy]URLopener constructor. 2502 """ 2503 # in order to prefer lowercase variables, process environment in 2504 # two passes: first matches any, second pass matches lowercase only 2505 2506 # select only environment variables which end in (after making lowercase) _proxy 2507 proxies = {} 2508 environment = [] 2509 for name in os.environ: 2510 # fast screen underscore position before more expensive case-folding 2511 if len(name) > 5 and name[-6] == "_" and name[-5:].lower() == "proxy": 2512 value = os.environ[name] 2513 proxy_name = name[:-6].lower() 2514 environment.append((name, value, proxy_name)) 2515 if value: 2516 proxies[proxy_name] = value 2517 # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY 2518 # (non-all-lowercase) as it may be set from the web server by a "Proxy:" 2519 # header from the client 2520 # If "proxy" is lowercase, it will still be used thanks to the next block 2521 if 'REQUEST_METHOD' in os.environ: 2522 proxies.pop('http', None) 2523 for name, value, proxy_name in environment: 2524 # not case-folded, checking here for lower-case env vars only 2525 if name[-6:] == '_proxy': 2526 if value: 2527 proxies[proxy_name] = value 2528 else: 2529 proxies.pop(proxy_name, None) 2530 return proxies 2531 2532def proxy_bypass_environment(host, proxies=None): 2533 """Test if proxies should not be used for a particular host. 2534 2535 Checks the proxy dict for the value of no_proxy, which should 2536 be a list of comma separated DNS suffixes, or '*' for all hosts. 2537 2538 """ 2539 if proxies is None: 2540 proxies = getproxies_environment() 2541 # don't bypass, if no_proxy isn't specified 2542 try: 2543 no_proxy = proxies['no'] 2544 except KeyError: 2545 return False 2546 # '*' is special case for always bypass 2547 if no_proxy == '*': 2548 return True 2549 host = host.lower() 2550 # strip port off host 2551 hostonly, port = _splitport(host) 2552 # check if the host ends with any of the DNS suffixes 2553 for name in no_proxy.split(','): 2554 name = name.strip() 2555 if name: 2556 name = name.lstrip('.') # ignore leading dots 2557 name = name.lower() 2558 if hostonly == name or host == name: 2559 return True 2560 name = '.' + name 2561 if hostonly.endswith(name) or host.endswith(name): 2562 return True 2563 # otherwise, don't bypass 2564 return False 2565 2566 2567# This code tests an OSX specific data structure but is testable on all 2568# platforms 2569def _proxy_bypass_macosx_sysconf(host, proxy_settings): 2570 """ 2571 Return True iff this host shouldn't be accessed using a proxy 2572 2573 This function uses the MacOSX framework SystemConfiguration 2574 to fetch the proxy information. 2575 2576 proxy_settings come from _scproxy._get_proxy_settings or get mocked ie: 2577 { 'exclude_simple': bool, 2578 'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16'] 2579 } 2580 """ 2581 from fnmatch import fnmatch 2582 from ipaddress import AddressValueError, IPv4Address 2583 2584 hostonly, port = _splitport(host) 2585 2586 def ip2num(ipAddr): 2587 parts = ipAddr.split('.') 2588 parts = list(map(int, parts)) 2589 if len(parts) != 4: 2590 parts = (parts + [0, 0, 0, 0])[:4] 2591 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] 2592 2593 # Check for simple host names: 2594 if '.' not in host: 2595 if proxy_settings['exclude_simple']: 2596 return True 2597 2598 hostIP = None 2599 try: 2600 hostIP = int(IPv4Address(hostonly)) 2601 except AddressValueError: 2602 pass 2603 2604 for value in proxy_settings.get('exceptions', ()): 2605 # Items in the list are strings like these: *.local, 169.254/16 2606 if not value: continue 2607 2608 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) 2609 if m is not None and hostIP is not None: 2610 base = ip2num(m.group(1)) 2611 mask = m.group(2) 2612 if mask is None: 2613 mask = 8 * (m.group(1).count('.') + 1) 2614 else: 2615 mask = int(mask[1:]) 2616 2617 if mask < 0 or mask > 32: 2618 # System libraries ignore invalid prefix lengths 2619 continue 2620 2621 mask = 32 - mask 2622 2623 if (hostIP >> mask) == (base >> mask): 2624 return True 2625 2626 elif fnmatch(host, value): 2627 return True 2628 2629 return False 2630 2631 2632# Same as _proxy_bypass_macosx_sysconf, testable on all platforms 2633def _proxy_bypass_winreg_override(host, override): 2634 """Return True if the host should bypass the proxy server. 2635 2636 The proxy override list is obtained from the Windows 2637 Internet settings proxy override registry value. 2638 2639 An example of a proxy override value is: 2640 "www.example.com;*.example.net; 192.168.0.1" 2641 """ 2642 from fnmatch import fnmatch 2643 2644 host, _ = _splitport(host) 2645 proxy_override = override.split(';') 2646 for test in proxy_override: 2647 test = test.strip() 2648 # "<local>" should bypass the proxy server for all intranet addresses 2649 if test == '<local>': 2650 if '.' not in host: 2651 return True 2652 elif fnmatch(host, test): 2653 return True 2654 return False 2655 2656 2657if sys.platform == 'darwin': 2658 from _scproxy import _get_proxy_settings, _get_proxies 2659 2660 def proxy_bypass_macosx_sysconf(host): 2661 proxy_settings = _get_proxy_settings() 2662 return _proxy_bypass_macosx_sysconf(host, proxy_settings) 2663 2664 def getproxies_macosx_sysconf(): 2665 """Return a dictionary of scheme -> proxy server URL mappings. 2666 2667 This function uses the MacOSX framework SystemConfiguration 2668 to fetch the proxy information. 2669 """ 2670 return _get_proxies() 2671 2672 2673 2674 def proxy_bypass(host): 2675 """Return True, if host should be bypassed. 2676 2677 Checks proxy settings gathered from the environment, if specified, 2678 or from the MacOSX framework SystemConfiguration. 2679 2680 """ 2681 proxies = getproxies_environment() 2682 if proxies: 2683 return proxy_bypass_environment(host, proxies) 2684 else: 2685 return proxy_bypass_macosx_sysconf(host) 2686 2687 def getproxies(): 2688 return getproxies_environment() or getproxies_macosx_sysconf() 2689 2690 2691elif os.name == 'nt': 2692 def getproxies_registry(): 2693 """Return a dictionary of scheme -> proxy server URL mappings. 2694 2695 Win32 uses the registry to store proxies. 2696 2697 """ 2698 proxies = {} 2699 try: 2700 import winreg 2701 except ImportError: 2702 # Std module, so should be around - but you never know! 2703 return proxies 2704 try: 2705 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2706 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2707 proxyEnable = winreg.QueryValueEx(internetSettings, 2708 'ProxyEnable')[0] 2709 if proxyEnable: 2710 # Returned as Unicode but problems if not converted to ASCII 2711 proxyServer = str(winreg.QueryValueEx(internetSettings, 2712 'ProxyServer')[0]) 2713 if '=' not in proxyServer and ';' not in proxyServer: 2714 # Use one setting for all protocols. 2715 proxyServer = 'http={0};https={0};ftp={0}'.format(proxyServer) 2716 for p in proxyServer.split(';'): 2717 protocol, address = p.split('=', 1) 2718 # See if address has a type:// prefix 2719 if not re.match('(?:[^/:]+)://', address): 2720 # Add type:// prefix to address without specifying type 2721 if protocol in ('http', 'https', 'ftp'): 2722 # The default proxy type of Windows is HTTP 2723 address = 'http://' + address 2724 elif protocol == 'socks': 2725 address = 'socks://' + address 2726 proxies[protocol] = address 2727 # Use SOCKS proxy for HTTP(S) protocols 2728 if proxies.get('socks'): 2729 # The default SOCKS proxy type of Windows is SOCKS4 2730 address = re.sub(r'^socks://', 'socks4://', proxies['socks']) 2731 proxies['http'] = proxies.get('http') or address 2732 proxies['https'] = proxies.get('https') or address 2733 internetSettings.Close() 2734 except (OSError, ValueError, TypeError): 2735 # Either registry key not found etc, or the value in an 2736 # unexpected format. 2737 # proxies already set up to be empty so nothing to do 2738 pass 2739 return proxies 2740 2741 def getproxies(): 2742 """Return a dictionary of scheme -> proxy server URL mappings. 2743 2744 Returns settings gathered from the environment, if specified, 2745 or the registry. 2746 2747 """ 2748 return getproxies_environment() or getproxies_registry() 2749 2750 def proxy_bypass_registry(host): 2751 try: 2752 import winreg 2753 except ImportError: 2754 # Std modules, so should be around - but you never know! 2755 return False 2756 try: 2757 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2758 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2759 proxyEnable = winreg.QueryValueEx(internetSettings, 2760 'ProxyEnable')[0] 2761 proxyOverride = str(winreg.QueryValueEx(internetSettings, 2762 'ProxyOverride')[0]) 2763 # ^^^^ Returned as Unicode but problems if not converted to ASCII 2764 except OSError: 2765 return False 2766 if not proxyEnable or not proxyOverride: 2767 return False 2768 return _proxy_bypass_winreg_override(host, proxyOverride) 2769 2770 def proxy_bypass(host): 2771 """Return True, if host should be bypassed. 2772 2773 Checks proxy settings gathered from the environment, if specified, 2774 or the registry. 2775 2776 """ 2777 proxies = getproxies_environment() 2778 if proxies: 2779 return proxy_bypass_environment(host, proxies) 2780 else: 2781 return proxy_bypass_registry(host) 2782 2783else: 2784 # By default use environment variables 2785 getproxies = getproxies_environment 2786 proxy_bypass = proxy_bypass_environment 2787