1r"""HTTP/1.1 client library 2 3<intro stuff goes here> 4<other stuff, too> 5 6HTTPConnection goes through a number of "states", which define when a client 7may legally make another request or fetch the response for a particular 8request. This diagram details these state transitions: 9 10 (null) 11 | 12 | HTTPConnection() 13 v 14 Idle 15 | 16 | putrequest() 17 v 18 Request-started 19 | 20 | ( putheader() )* endheaders() 21 v 22 Request-sent 23 |\_____________________________ 24 | | getresponse() raises 25 | response = getresponse() | ConnectionError 26 v v 27 Unread-response Idle 28 [Response-headers-read] 29 |\____________________ 30 | | 31 | response.read() | putrequest() 32 v v 33 Idle Req-started-unread-response 34 ______/| 35 / | 36 response.read() | | ( putheader() )* endheaders() 37 v v 38 Request-started Req-sent-unread-response 39 | 40 | response.read() 41 v 42 Request-sent 43 44This diagram presents the following rules: 45 -- a second request may not be started until {response-headers-read} 46 -- a response [object] cannot be retrieved until {request-sent} 47 -- there is no differentiation between an unread response body and a 48 partially read response body 49 50Note: this enforcement is applied by the HTTPConnection class. The 51 HTTPResponse class does not enforce this state machine, which 52 implies sophisticated clients may accelerate the request/response 53 pipeline. Caution should be taken, though: accelerating the states 54 beyond the above pattern may imply knowledge of the server's 55 connection-close behavior for certain requests. For example, it 56 is impossible to tell whether the server will close the connection 57 UNTIL the response headers have been read; this means that further 58 requests cannot be placed into the pipeline until it is known that 59 the server will NOT be closing the connection. 60 61Logical State __state __response 62------------- ------- ---------- 63Idle _CS_IDLE None 64Request-started _CS_REQ_STARTED None 65Request-sent _CS_REQ_SENT None 66Unread-response _CS_IDLE <response_class> 67Req-started-unread-response _CS_REQ_STARTED <response_class> 68Req-sent-unread-response _CS_REQ_SENT <response_class> 69""" 70 71import email.parser 72import email.message 73import http 74import io 75import re 76import socket 77import collections.abc 78from urllib.parse import urlsplit 79 80# HTTPMessage, parse_headers(), and the HTTP status code constants are 81# intentionally omitted for simplicity 82__all__ = ["HTTPResponse", "HTTPConnection", 83 "HTTPException", "NotConnected", "UnknownProtocol", 84 "UnknownTransferEncoding", "UnimplementedFileMode", 85 "IncompleteRead", "InvalidURL", "ImproperConnectionState", 86 "CannotSendRequest", "CannotSendHeader", "ResponseNotReady", 87 "BadStatusLine", "LineTooLong", "RemoteDisconnected", "error", 88 "responses"] 89 90HTTP_PORT = 80 91HTTPS_PORT = 443 92 93_UNKNOWN = 'UNKNOWN' 94 95# connection states 96_CS_IDLE = 'Idle' 97_CS_REQ_STARTED = 'Request-started' 98_CS_REQ_SENT = 'Request-sent' 99 100 101# hack to maintain backwards compatibility 102globals().update(http.HTTPStatus.__members__) 103 104# another hack to maintain backwards compatibility 105# Mapping status codes to official W3C names 106responses = {v: v.phrase for v in http.HTTPStatus.__members__.values()} 107 108# maximal line length when calling readline(). 
109_MAXLINE = 65536 110_MAXHEADERS = 100 111 112# Header name/value ABNF (http://tools.ietf.org/html/rfc7230#section-3.2) 113# 114# VCHAR = %x21-7E 115# obs-text = %x80-FF 116# header-field = field-name ":" OWS field-value OWS 117# field-name = token 118# field-value = *( field-content / obs-fold ) 119# field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ] 120# field-vchar = VCHAR / obs-text 121# 122# obs-fold = CRLF 1*( SP / HTAB ) 123# ; obsolete line folding 124# ; see Section 3.2.4 125 126# token = 1*tchar 127# 128# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" 129# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~" 130# / DIGIT / ALPHA 131# ; any VCHAR, except delimiters 132# 133# VCHAR defined in http://tools.ietf.org/html/rfc5234#appendix-B.1 134 135# the patterns for both name and value are more lenient than RFC 136# definitions to allow for backwards compatibility 137_is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch 138_is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search 139 140# These characters are not allowed within HTTP URL paths. 141# See https://tools.ietf.org/html/rfc3986#section-3.3 and the 142# https://tools.ietf.org/html/rfc3986#appendix-A pchar definition. 143# Prevents CVE-2019-9740. Includes control characters such as \r\n. 144# We don't restrict chars above \x7f as putrequest() limits us to ASCII. 145_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]') 146# Arguably only these _should_ allowed: 147# _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$") 148# We are more lenient for assumed real world compatibility purposes. 149 150# These characters are not allowed within HTTP method names 151# to prevent http header injection. 152_contains_disallowed_method_pchar_re = re.compile('[\x00-\x1f]') 153 154# We always set the Content-Length header for these methods because some 155# servers will otherwise respond with a 411 156_METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'} 157 158 159def _encode(data, name='data'): 160 """Call data.encode("latin-1") but show a better error message.""" 161 try: 162 return data.encode("latin-1") 163 except UnicodeEncodeError as err: 164 raise UnicodeEncodeError( 165 err.encoding, 166 err.object, 167 err.start, 168 err.end, 169 "%s (%.20r) is not valid Latin-1. Use %s.encode('utf-8') " 170 "if you want to send it encoded in UTF-8." % 171 (name.title(), data[err.start:err.end], name)) from None 172 173 174class HTTPMessage(email.message.Message): 175 # XXX The only usage of this method is in 176 # http.server.CGIHTTPRequestHandler. Maybe move the code there so 177 # that it doesn't need to be part of the public API. The API has 178 # never been defined so this could cause backwards compatibility 179 # issues. 180 181 def getallmatchingheaders(self, name): 182 """Find all header lines matching a given header name. 183 184 Look through the list of headers and find all lines matching a given 185 header name (and their continuation lines). A list of the lines is 186 returned, without interpretation. If the header does not occur, an 187 empty list is returned. If the header occurs multiple times, all 188 occurrences are returned. Case is not important in the header name. 
189 190 """ 191 name = name.lower() + ':' 192 n = len(name) 193 lst = [] 194 hit = 0 195 for line in self.keys(): 196 if line[:n].lower() == name: 197 hit = 1 198 elif not line[:1].isspace(): 199 hit = 0 200 if hit: 201 lst.append(line) 202 return lst 203 204def _read_headers(fp): 205 """Reads potential header lines into a list from a file pointer. 206 207 Length of line is limited by _MAXLINE, and number of 208 headers is limited by _MAXHEADERS. 209 """ 210 headers = [] 211 while True: 212 line = fp.readline(_MAXLINE + 1) 213 if len(line) > _MAXLINE: 214 raise LineTooLong("header line") 215 headers.append(line) 216 if len(headers) > _MAXHEADERS: 217 raise HTTPException("got more than %d headers" % _MAXHEADERS) 218 if line in (b'\r\n', b'\n', b''): 219 break 220 return headers 221 222def parse_headers(fp, _class=HTTPMessage): 223 """Parses only RFC2822 headers from a file pointer. 224 225 email Parser wants to see strings rather than bytes. 226 But a TextIOWrapper around self.rfile would buffer too many bytes 227 from the stream, bytes which we later need to read as bytes. 228 So we read the correct bytes here, as bytes, for email Parser 229 to parse. 230 231 """ 232 headers = _read_headers(fp) 233 hstring = b''.join(headers).decode('iso-8859-1') 234 return email.parser.Parser(_class=_class).parsestr(hstring) 235 236 237class HTTPResponse(io.BufferedIOBase): 238 239 # See RFC 2616 sec 19.6 and RFC 1945 sec 6 for details. 240 241 # The bytes from the socket object are iso-8859-1 strings. 242 # See RFC 2616 sec 2.2 which notes an exception for MIME-encoded 243 # text following RFC 2047. The basic status line parsing only 244 # accepts iso-8859-1. 245 246 def __init__(self, sock, debuglevel=0, method=None, url=None): 247 # If the response includes a content-length header, we need to 248 # make sure that the client doesn't read more than the 249 # specified number of bytes. If it does, it will block until 250 # the server times out and closes the connection. This will 251 # happen if a self.fp.read() is done (without a size) whether 252 # self.fp is buffered or not. So, no self.fp.read() by 253 # clients unless they know what they are doing. 254 self.fp = sock.makefile("rb") 255 self.debuglevel = debuglevel 256 self._method = method 257 258 # The HTTPResponse object is returned via urllib. The clients 259 # of http and urllib expect different attributes for the 260 # headers. headers is used here and supports urllib. msg is 261 # provided as a backwards compatibility layer for http 262 # clients. 263 264 self.headers = self.msg = None 265 266 # from the Status-Line of the response 267 self.version = _UNKNOWN # HTTP-Version 268 self.status = _UNKNOWN # Status-Code 269 self.reason = _UNKNOWN # Reason-Phrase 270 271 self.chunked = _UNKNOWN # is "chunked" being used? 272 self.chunk_left = _UNKNOWN # bytes left to read in current chunk 273 self.length = _UNKNOWN # number of bytes left in response 274 self.will_close = _UNKNOWN # conn will close at end of response 275 276 def _read_status(self): 277 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 278 if len(line) > _MAXLINE: 279 raise LineTooLong("status line") 280 if self.debuglevel > 0: 281 print("reply:", repr(line)) 282 if not line: 283 # Presumably, the server closed the connection before 284 # sending a valid response. 
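            # (Contrast with BadStatusLine further down, which is raised for
            # a malformed-but-nonempty line such as "FTP/1.0 200 OK" --
            # illustrative example only.)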
285 raise RemoteDisconnected("Remote end closed connection without" 286 " response") 287 try: 288 version, status, reason = line.split(None, 2) 289 except ValueError: 290 try: 291 version, status = line.split(None, 1) 292 reason = "" 293 except ValueError: 294 # empty version will cause next test to fail. 295 version = "" 296 if not version.startswith("HTTP/"): 297 self._close_conn() 298 raise BadStatusLine(line) 299 300 # The status code is a three-digit number 301 try: 302 status = int(status) 303 if status < 100 or status > 999: 304 raise BadStatusLine(line) 305 except ValueError: 306 raise BadStatusLine(line) 307 return version, status, reason 308 309 def begin(self): 310 if self.headers is not None: 311 # we've already started reading the response 312 return 313 314 # read until we get a non-100 response 315 while True: 316 version, status, reason = self._read_status() 317 if status != CONTINUE: 318 break 319 # skip the header from the 100 response 320 skipped_headers = _read_headers(self.fp) 321 if self.debuglevel > 0: 322 print("headers:", skipped_headers) 323 del skipped_headers 324 325 self.code = self.status = status 326 self.reason = reason.strip() 327 if version in ("HTTP/1.0", "HTTP/0.9"): 328 # Some servers might still return "0.9", treat it as 1.0 anyway 329 self.version = 10 330 elif version.startswith("HTTP/1."): 331 self.version = 11 # use HTTP/1.1 code for HTTP/1.x where x>=1 332 else: 333 raise UnknownProtocol(version) 334 335 self.headers = self.msg = parse_headers(self.fp) 336 337 if self.debuglevel > 0: 338 for hdr, val in self.headers.items(): 339 print("header:", hdr + ":", val) 340 341 # are we using the chunked-style of transfer encoding? 342 tr_enc = self.headers.get("transfer-encoding") 343 if tr_enc and tr_enc.lower() == "chunked": 344 self.chunked = True 345 self.chunk_left = None 346 else: 347 self.chunked = False 348 349 # will the connection close at the end of the response? 350 self.will_close = self._check_close() 351 352 # do we have a Content-Length? 353 # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked" 354 self.length = None 355 length = self.headers.get("content-length") 356 357 # are we using the chunked-style of transfer encoding? 358 tr_enc = self.headers.get("transfer-encoding") 359 if length and not self.chunked: 360 try: 361 self.length = int(length) 362 except ValueError: 363 self.length = None 364 else: 365 if self.length < 0: # ignore nonsensical negative lengths 366 self.length = None 367 else: 368 self.length = None 369 370 # does the body have a fixed length? (of zero) 371 if (status == NO_CONTENT or status == NOT_MODIFIED or 372 100 <= status < 200 or # 1xx codes 373 self._method == "HEAD"): 374 self.length = 0 375 376 # if the connection remains open, and we aren't using chunked, and 377 # a content-length was not provided, then assume that the connection 378 # WILL close. 379 if (not self.will_close and 380 not self.chunked and 381 self.length is None): 382 self.will_close = True 383 384 def _check_close(self): 385 conn = self.headers.get("connection") 386 if self.version == 11: 387 # An HTTP/1.1 proxy is assumed to stay open unless 388 # explicitly closed. 389 if conn and "close" in conn.lower(): 390 return True 391 return False 392 393 # Some HTTP/1.0 implementations have support for persistent 394 # connections, using rules different than HTTP/1.1. 395 396 # For older HTTP, Keep-Alive indicates persistent connection. 
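        # (Illustrative: for an HTTP/1.0 response, a "Keep-Alive:" header or
        # "Connection: keep-alive" is treated below as a persistent
        # connection; barring the Proxy-Connection hack further down,
        # anything else falls through to "will close".)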
397 if self.headers.get("keep-alive"): 398 return False 399 400 # At least Akamai returns a "Connection: Keep-Alive" header, 401 # which was supposed to be sent by the client. 402 if conn and "keep-alive" in conn.lower(): 403 return False 404 405 # Proxy-Connection is a netscape hack. 406 pconn = self.headers.get("proxy-connection") 407 if pconn and "keep-alive" in pconn.lower(): 408 return False 409 410 # otherwise, assume it will close 411 return True 412 413 def _close_conn(self): 414 fp = self.fp 415 self.fp = None 416 fp.close() 417 418 def close(self): 419 try: 420 super().close() # set "closed" flag 421 finally: 422 if self.fp: 423 self._close_conn() 424 425 # These implementations are for the benefit of io.BufferedReader. 426 427 # XXX This class should probably be revised to act more like 428 # the "raw stream" that BufferedReader expects. 429 430 def flush(self): 431 super().flush() 432 if self.fp: 433 self.fp.flush() 434 435 def readable(self): 436 """Always returns True""" 437 return True 438 439 # End of "raw stream" methods 440 441 def isclosed(self): 442 """True if the connection is closed.""" 443 # NOTE: it is possible that we will not ever call self.close(). This 444 # case occurs when will_close is TRUE, length is None, and we 445 # read up to the last byte, but NOT past it. 446 # 447 # IMPLIES: if will_close is FALSE, then self.close() will ALWAYS be 448 # called, meaning self.isclosed() is meaningful. 449 return self.fp is None 450 451 def read(self, amt=None): 452 if self.fp is None: 453 return b"" 454 455 if self._method == "HEAD": 456 self._close_conn() 457 return b"" 458 459 if amt is not None: 460 # Amount is given, implement using readinto 461 b = bytearray(amt) 462 n = self.readinto(b) 463 return memoryview(b)[:n].tobytes() 464 else: 465 # Amount is not given (unbounded read) so we must check self.length 466 # and self.chunked 467 468 if self.chunked: 469 return self._readall_chunked() 470 471 if self.length is None: 472 s = self.fp.read() 473 else: 474 try: 475 s = self._safe_read(self.length) 476 except IncompleteRead: 477 self._close_conn() 478 raise 479 self.length = 0 480 self._close_conn() # we read everything 481 return s 482 483 def readinto(self, b): 484 """Read up to len(b) bytes into bytearray b and return the number 485 of bytes read. 486 """ 487 488 if self.fp is None: 489 return 0 490 491 if self._method == "HEAD": 492 self._close_conn() 493 return 0 494 495 if self.chunked: 496 return self._readinto_chunked(b) 497 498 if self.length is not None: 499 if len(b) > self.length: 500 # clip the read to the "end of response" 501 b = memoryview(b)[0:self.length] 502 503 # we do not use _safe_read() here because this may be a .will_close 504 # connection, and the user is reading more bytes than will be provided 505 # (for example, reading in 1k chunks) 506 n = self.fp.readinto(b) 507 if not n and b: 508 # Ideally, we would raise IncompleteRead if the content-length 509 # wasn't satisfied, but it might break compatibility. 
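            # (A short but nonzero read simply returns fewer bytes to the
            # caller; only a zero-byte read into a non-empty buffer is
            # treated as EOF here.  Compare _safe_read(), which does raise
            # IncompleteRead on truncation.)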
510 self._close_conn() 511 elif self.length is not None: 512 self.length -= n 513 if not self.length: 514 self._close_conn() 515 return n 516 517 def _read_next_chunk_size(self): 518 # Read the next chunk size from the file 519 line = self.fp.readline(_MAXLINE + 1) 520 if len(line) > _MAXLINE: 521 raise LineTooLong("chunk size") 522 i = line.find(b";") 523 if i >= 0: 524 line = line[:i] # strip chunk-extensions 525 try: 526 return int(line, 16) 527 except ValueError: 528 # close the connection as protocol synchronisation is 529 # probably lost 530 self._close_conn() 531 raise 532 533 def _read_and_discard_trailer(self): 534 # read and discard trailer up to the CRLF terminator 535 ### note: we shouldn't have any trailers! 536 while True: 537 line = self.fp.readline(_MAXLINE + 1) 538 if len(line) > _MAXLINE: 539 raise LineTooLong("trailer line") 540 if not line: 541 # a vanishingly small number of sites EOF without 542 # sending the trailer 543 break 544 if line in (b'\r\n', b'\n', b''): 545 break 546 547 def _get_chunk_left(self): 548 # return self.chunk_left, reading a new chunk if necessary. 549 # chunk_left == 0: at the end of the current chunk, need to close it 550 # chunk_left == None: No current chunk, should read next. 551 # This function returns non-zero or None if the last chunk has 552 # been read. 553 chunk_left = self.chunk_left 554 if not chunk_left: # Can be 0 or None 555 if chunk_left is not None: 556 # We are at the end of chunk, discard chunk end 557 self._safe_read(2) # toss the CRLF at the end of the chunk 558 try: 559 chunk_left = self._read_next_chunk_size() 560 except ValueError: 561 raise IncompleteRead(b'') 562 if chunk_left == 0: 563 # last chunk: 1*("0") [ chunk-extension ] CRLF 564 self._read_and_discard_trailer() 565 # we read everything; close the "file" 566 self._close_conn() 567 chunk_left = None 568 self.chunk_left = chunk_left 569 return chunk_left 570 571 def _readall_chunked(self): 572 assert self.chunked != _UNKNOWN 573 value = [] 574 try: 575 while True: 576 chunk_left = self._get_chunk_left() 577 if chunk_left is None: 578 break 579 value.append(self._safe_read(chunk_left)) 580 self.chunk_left = 0 581 return b''.join(value) 582 except IncompleteRead: 583 raise IncompleteRead(b''.join(value)) 584 585 def _readinto_chunked(self, b): 586 assert self.chunked != _UNKNOWN 587 total_bytes = 0 588 mvb = memoryview(b) 589 try: 590 while True: 591 chunk_left = self._get_chunk_left() 592 if chunk_left is None: 593 return total_bytes 594 595 if len(mvb) <= chunk_left: 596 n = self._safe_readinto(mvb) 597 self.chunk_left = chunk_left - n 598 return total_bytes + n 599 600 temp_mvb = mvb[:chunk_left] 601 n = self._safe_readinto(temp_mvb) 602 mvb = mvb[n:] 603 total_bytes += n 604 self.chunk_left = 0 605 606 except IncompleteRead: 607 raise IncompleteRead(bytes(b[0:total_bytes])) 608 609 def _safe_read(self, amt): 610 """Read the number of bytes requested. 611 612 This function should be used when <amt> bytes "should" be present for 613 reading. If the bytes are truly not available (due to EOF), then the 614 IncompleteRead exception can be used to detect the problem. 
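
        A typical call site looks roughly like this (sketch only):

            try:
                data = self._safe_read(self.length)
            except IncompleteRead:
                self._close_conn()
                raise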
615 """ 616 data = self.fp.read(amt) 617 if len(data) < amt: 618 raise IncompleteRead(data, amt-len(data)) 619 return data 620 621 def _safe_readinto(self, b): 622 """Same as _safe_read, but for reading into a buffer.""" 623 amt = len(b) 624 n = self.fp.readinto(b) 625 if n < amt: 626 raise IncompleteRead(bytes(b[:n]), amt-n) 627 return n 628 629 def read1(self, n=-1): 630 """Read with at most one underlying system call. If at least one 631 byte is buffered, return that instead. 632 """ 633 if self.fp is None or self._method == "HEAD": 634 return b"" 635 if self.chunked: 636 return self._read1_chunked(n) 637 if self.length is not None and (n < 0 or n > self.length): 638 n = self.length 639 result = self.fp.read1(n) 640 if not result and n: 641 self._close_conn() 642 elif self.length is not None: 643 self.length -= len(result) 644 return result 645 646 def peek(self, n=-1): 647 # Having this enables IOBase.readline() to read more than one 648 # byte at a time 649 if self.fp is None or self._method == "HEAD": 650 return b"" 651 if self.chunked: 652 return self._peek_chunked(n) 653 return self.fp.peek(n) 654 655 def readline(self, limit=-1): 656 if self.fp is None or self._method == "HEAD": 657 return b"" 658 if self.chunked: 659 # Fallback to IOBase readline which uses peek() and read() 660 return super().readline(limit) 661 if self.length is not None and (limit < 0 or limit > self.length): 662 limit = self.length 663 result = self.fp.readline(limit) 664 if not result and limit: 665 self._close_conn() 666 elif self.length is not None: 667 self.length -= len(result) 668 return result 669 670 def _read1_chunked(self, n): 671 # Strictly speaking, _get_chunk_left() may cause more than one read, 672 # but that is ok, since that is to satisfy the chunked protocol. 673 chunk_left = self._get_chunk_left() 674 if chunk_left is None or n == 0: 675 return b'' 676 if not (0 <= n <= chunk_left): 677 n = chunk_left # if n is negative or larger than chunk_left 678 read = self.fp.read1(n) 679 self.chunk_left -= len(read) 680 if not read: 681 raise IncompleteRead(b"") 682 return read 683 684 def _peek_chunked(self, n): 685 # Strictly speaking, _get_chunk_left() may cause more than one read, 686 # but that is ok, since that is to satisfy the chunked protocol. 687 try: 688 chunk_left = self._get_chunk_left() 689 except IncompleteRead: 690 return b'' # peek doesn't worry about protocol 691 if chunk_left is None: 692 return b'' # eof 693 # peek is allowed to return more than requested. Just request the 694 # entire chunk, and truncate what we get. 695 return self.fp.peek(chunk_left)[:chunk_left] 696 697 def fileno(self): 698 return self.fp.fileno() 699 700 def getheader(self, name, default=None): 701 '''Returns the value of the header matching *name*. 702 703 If there are multiple matching headers, the values are 704 combined into a single string separated by commas and spaces. 705 706 If no matching header is found, returns *default* or None if 707 the *default* is not specified. 708 709 If the headers are unknown, raises http.client.ResponseNotReady. 
710 711 ''' 712 if self.headers is None: 713 raise ResponseNotReady() 714 headers = self.headers.get_all(name) or default 715 if isinstance(headers, str) or not hasattr(headers, '__iter__'): 716 return headers 717 else: 718 return ', '.join(headers) 719 720 def getheaders(self): 721 """Return list of (header, value) tuples.""" 722 if self.headers is None: 723 raise ResponseNotReady() 724 return list(self.headers.items()) 725 726 # We override IOBase.__iter__ so that it doesn't check for closed-ness 727 728 def __iter__(self): 729 return self 730 731 # For compatibility with old-style urllib responses. 732 733 def info(self): 734 '''Returns an instance of the class mimetools.Message containing 735 meta-information associated with the URL. 736 737 When the method is HTTP, these headers are those returned by 738 the server at the head of the retrieved HTML page (including 739 Content-Length and Content-Type). 740 741 When the method is FTP, a Content-Length header will be 742 present if (as is now usual) the server passed back a file 743 length in response to the FTP retrieval request. A 744 Content-Type header will be present if the MIME type can be 745 guessed. 746 747 When the method is local-file, returned headers will include 748 a Date representing the file's last-modified time, a 749 Content-Length giving file size, and a Content-Type 750 containing a guess at the file's type. See also the 751 description of the mimetools module. 752 753 ''' 754 return self.headers 755 756 def geturl(self): 757 '''Return the real URL of the page. 758 759 In some cases, the HTTP server redirects a client to another 760 URL. The urlopen() function handles this transparently, but in 761 some cases the caller needs to know which URL the client was 762 redirected to. The geturl() method can be used to get at this 763 redirected URL. 764 765 ''' 766 return self.url 767 768 def getcode(self): 769 '''Return the HTTP status code that was sent with the response, 770 or None if the URL is not an HTTP URL. 771 772 ''' 773 return self.status 774 775class HTTPConnection: 776 777 _http_vsn = 11 778 _http_vsn_str = 'HTTP/1.1' 779 780 response_class = HTTPResponse 781 default_port = HTTP_PORT 782 auto_open = 1 783 debuglevel = 0 784 785 @staticmethod 786 def _is_textIO(stream): 787 """Test whether a file-like object is a text or a binary stream. 788 """ 789 return isinstance(stream, io.TextIOBase) 790 791 @staticmethod 792 def _get_content_length(body, method): 793 """Get the content-length based on the body. 794 795 If the body is None, we set Content-Length: 0 for methods that expect 796 a body (RFC 7230, Section 3.3.2). We also set the Content-Length for 797 any method if the body is a str or bytes-like object and not a file. 798 """ 799 if body is None: 800 # do an explicit check for not None here to distinguish 801 # between unset and set but empty 802 if method.upper() in _METHODS_EXPECTING_BODY: 803 return 0 804 else: 805 return None 806 807 if hasattr(body, 'read'): 808 # file-like object. 809 return None 810 811 try: 812 # does it implement the buffer protocol (bytes, bytearray, array)? 
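            # (e.g. memoryview(b"abc").nbytes == 3 -- illustrative only; a
            # str body is measured below, and file-like bodies were already
            # handled above.)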
            mv = memoryview(body)
            return mv.nbytes
        except TypeError:
            pass

        if isinstance(body, str):
            return len(body)

        return None

    def __init__(self, host, port=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
                 source_address=None, blocksize=8192):
        self.timeout = timeout
        self.source_address = source_address
        self.blocksize = blocksize
        self.sock = None
        self._buffer = []
        self.__response = None
        self.__state = _CS_IDLE
        self._method = None
        self._tunnel_host = None
        self._tunnel_port = None
        self._tunnel_headers = {}

        (self.host, self.port) = self._get_hostport(host, port)

        self._validate_host(self.host)

        # This is stored as an instance variable to allow unit
        # tests to replace it with a suitable mockup
        self._create_connection = socket.create_connection

    def set_tunnel(self, host, port=None, headers=None):
        """Set up host and port for HTTP CONNECT tunnelling.

        In a connection that uses HTTP CONNECT tunneling, the host passed to the
        constructor is used as a proxy server that relays all communication to
        the endpoint passed to `set_tunnel`. This is done by sending an HTTP
        CONNECT request to the proxy server when the connection is established.

        This method must be called before the HTTP connection has been
        established.

        The headers argument should be a mapping of extra HTTP headers to send
        with the CONNECT request.
        """

        if self.sock:
            raise RuntimeError("Can't set up tunnel for established connection")

        self._tunnel_host, self._tunnel_port = self._get_hostport(host, port)
        if headers:
            self._tunnel_headers = headers
        else:
            self._tunnel_headers.clear()

    def _get_hostport(self, host, port):
        if port is None:
            i = host.rfind(':')
            j = host.rfind(']')         # ipv6 addresses have [...]
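            # A few illustrative parses (assuming default_port == 80):
            #   "www.python.org:8080"  -> ("www.python.org", 8080)
            #   "www.python.org"       -> ("www.python.org", 80)
            #   "[::1]:8080"           -> ("::1", 8080)
            #   "[::1]"                -> ("::1", 80)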
873 if i > j: 874 try: 875 port = int(host[i+1:]) 876 except ValueError: 877 if host[i+1:] == "": # http://foo.com:/ == http://foo.com/ 878 port = self.default_port 879 else: 880 raise InvalidURL("nonnumeric port: '%s'" % host[i+1:]) 881 host = host[:i] 882 else: 883 port = self.default_port 884 if host and host[0] == '[' and host[-1] == ']': 885 host = host[1:-1] 886 887 return (host, port) 888 889 def set_debuglevel(self, level): 890 self.debuglevel = level 891 892 def _tunnel(self): 893 connect_str = "CONNECT %s:%d HTTP/1.0\r\n" % (self._tunnel_host, 894 self._tunnel_port) 895 connect_bytes = connect_str.encode("ascii") 896 self.send(connect_bytes) 897 for header, value in self._tunnel_headers.items(): 898 header_str = "%s: %s\r\n" % (header, value) 899 header_bytes = header_str.encode("latin-1") 900 self.send(header_bytes) 901 self.send(b'\r\n') 902 903 response = self.response_class(self.sock, method=self._method) 904 (version, code, message) = response._read_status() 905 906 if code != http.HTTPStatus.OK: 907 self.close() 908 raise OSError("Tunnel connection failed: %d %s" % (code, 909 message.strip())) 910 while True: 911 line = response.fp.readline(_MAXLINE + 1) 912 if len(line) > _MAXLINE: 913 raise LineTooLong("header line") 914 if not line: 915 # for sites which EOF without sending a trailer 916 break 917 if line in (b'\r\n', b'\n', b''): 918 break 919 920 if self.debuglevel > 0: 921 print('header:', line.decode()) 922 923 def connect(self): 924 """Connect to the host and port specified in __init__.""" 925 self.sock = self._create_connection( 926 (self.host,self.port), self.timeout, self.source_address) 927 self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 928 929 if self._tunnel_host: 930 self._tunnel() 931 932 def close(self): 933 """Close the connection to the HTTP server.""" 934 self.__state = _CS_IDLE 935 try: 936 sock = self.sock 937 if sock: 938 self.sock = None 939 sock.close() # close it manually... there may be other refs 940 finally: 941 response = self.__response 942 if response: 943 self.__response = None 944 response.close() 945 946 def send(self, data): 947 """Send `data' to the server. 948 ``data`` can be a string object, a bytes object, an array object, a 949 file-like object that supports a .read() method, or an iterable object. 950 """ 951 952 if self.sock is None: 953 if self.auto_open: 954 self.connect() 955 else: 956 raise NotConnected() 957 958 if self.debuglevel > 0: 959 print("send:", repr(data)) 960 if hasattr(data, "read") : 961 if self.debuglevel > 0: 962 print("sendIng a read()able") 963 encode = self._is_textIO(data) 964 if encode and self.debuglevel > 0: 965 print("encoding file using iso-8859-1") 966 while 1: 967 datablock = data.read(self.blocksize) 968 if not datablock: 969 break 970 if encode: 971 datablock = datablock.encode("iso-8859-1") 972 self.sock.sendall(datablock) 973 return 974 try: 975 self.sock.sendall(data) 976 except TypeError: 977 if isinstance(data, collections.abc.Iterable): 978 for d in data: 979 self.sock.sendall(d) 980 else: 981 raise TypeError("data should be a bytes-like object " 982 "or an iterable, got %r" % type(data)) 983 984 def _output(self, s): 985 """Add a line of output to the current request buffer. 986 987 Assumes that the line does *not* end with \\r\\n. 
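
        For example (sketch only): putheader() ends up calling
        self._output(b'Host: example.com') for a typical Host header.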
988 """ 989 self._buffer.append(s) 990 991 def _read_readable(self, readable): 992 if self.debuglevel > 0: 993 print("sendIng a read()able") 994 encode = self._is_textIO(readable) 995 if encode and self.debuglevel > 0: 996 print("encoding file using iso-8859-1") 997 while True: 998 datablock = readable.read(self.blocksize) 999 if not datablock: 1000 break 1001 if encode: 1002 datablock = datablock.encode("iso-8859-1") 1003 yield datablock 1004 1005 def _send_output(self, message_body=None, encode_chunked=False): 1006 """Send the currently buffered request and clear the buffer. 1007 1008 Appends an extra \\r\\n to the buffer. 1009 A message_body may be specified, to be appended to the request. 1010 """ 1011 self._buffer.extend((b"", b"")) 1012 msg = b"\r\n".join(self._buffer) 1013 del self._buffer[:] 1014 self.send(msg) 1015 1016 if message_body is not None: 1017 1018 # create a consistent interface to message_body 1019 if hasattr(message_body, 'read'): 1020 # Let file-like take precedence over byte-like. This 1021 # is needed to allow the current position of mmap'ed 1022 # files to be taken into account. 1023 chunks = self._read_readable(message_body) 1024 else: 1025 try: 1026 # this is solely to check to see if message_body 1027 # implements the buffer API. it /would/ be easier 1028 # to capture if PyObject_CheckBuffer was exposed 1029 # to Python. 1030 memoryview(message_body) 1031 except TypeError: 1032 try: 1033 chunks = iter(message_body) 1034 except TypeError: 1035 raise TypeError("message_body should be a bytes-like " 1036 "object or an iterable, got %r" 1037 % type(message_body)) 1038 else: 1039 # the object implements the buffer interface and 1040 # can be passed directly into socket methods 1041 chunks = (message_body,) 1042 1043 for chunk in chunks: 1044 if not chunk: 1045 if self.debuglevel > 0: 1046 print('Zero length chunk ignored') 1047 continue 1048 1049 if encode_chunked and self._http_vsn == 11: 1050 # chunked encoding 1051 chunk = f'{len(chunk):X}\r\n'.encode('ascii') + chunk \ 1052 + b'\r\n' 1053 self.send(chunk) 1054 1055 if encode_chunked and self._http_vsn == 11: 1056 # end chunked transfer 1057 self.send(b'0\r\n\r\n') 1058 1059 def putrequest(self, method, url, skip_host=False, 1060 skip_accept_encoding=False): 1061 """Send a request to the server. 1062 1063 `method' specifies an HTTP request method, e.g. 'GET'. 1064 `url' specifies the object being requested, e.g. '/index.html'. 1065 `skip_host' if True does not add automatically a 'Host:' header 1066 `skip_accept_encoding' if True does not add automatically an 1067 'Accept-Encoding:' header 1068 """ 1069 1070 # if a prior response has been completed, then forget about it. 1071 if self.__response and self.__response.isclosed(): 1072 self.__response = None 1073 1074 1075 # in certain cases, we cannot issue another request on this connection. 1076 # this occurs when: 1077 # 1) we are in the process of sending a request. (_CS_REQ_STARTED) 1078 # 2) a response to a previous request has signalled that it is going 1079 # to close the connection upon completion. 1080 # 3) the headers for the previous response have not been read, thus 1081 # we cannot determine whether point (2) is true. (_CS_REQ_SENT) 1082 # 1083 # if there is no prior response, then we can request at will. 1084 # 1085 # if point (2) is true, then we will have passed the socket to the 1086 # response (effectively meaning, "there is no prior response"), and 1087 # will open a new one when a new request is made. 
        #
        # Note: if a prior response exists, then we *can* start a new request.
        #   We are not allowed to begin fetching the response to this new
        #   request, however, until that prior response is complete.
        #
        if self.__state == _CS_IDLE:
            self.__state = _CS_REQ_STARTED
        else:
            raise CannotSendRequest(self.__state)

        self._validate_method(method)

        # Save the method for use later in the response phase
        self._method = method

        url = url or '/'
        self._validate_path(url)

        request = '%s %s %s' % (method, url, self._http_vsn_str)

        self._output(self._encode_request(request))

        if self._http_vsn == 11:
            # Issue some standard headers for better HTTP/1.1 compliance

            if not skip_host:
                # this header is issued *only* for HTTP/1.1
                # connections. more specifically, this means it is
                # only issued when the client uses the new
                # HTTPConnection() class. backwards-compat clients
                # will be using HTTP/1.0 and those clients may be
                # issuing this header themselves. we should NOT issue
                # it twice; some web servers (such as Apache) barf
                # when they see two Host: headers

                # If we need a non-standard port, include it in the
                # header.  If the request is going through a proxy,
                # use the host of the actual URL, not the host of the
                # proxy.

                netloc = ''
                if url.startswith('http'):
                    nil, netloc, nil, nil, nil = urlsplit(url)

                if netloc:
                    try:
                        netloc_enc = netloc.encode("ascii")
                    except UnicodeEncodeError:
                        netloc_enc = netloc.encode("idna")
                    self.putheader('Host', netloc_enc)
                else:
                    if self._tunnel_host:
                        host = self._tunnel_host
                        port = self._tunnel_port
                    else:
                        host = self.host
                        port = self.port

                    try:
                        host_enc = host.encode("ascii")
                    except UnicodeEncodeError:
                        host_enc = host.encode("idna")

                    # As per RFC 2732, IPv6 addresses should be wrapped
                    # with [] when used as the Host header

                    if host.find(':') >= 0:
                        host_enc = b'[' + host_enc + b']'

                    if port == self.default_port:
                        self.putheader('Host', host_enc)
                    else:
                        host_enc = host_enc.decode("ascii")
                        self.putheader('Host', "%s:%s" % (host_enc, port))

            # note: we are assuming that clients will not attempt to set these
            #       headers since *this* library must deal with the
            #       consequences. this also means that when the supporting
            #       libraries are updated to recognize other forms, then this
            #       code should be changed (removed or updated).

            # we only want a Content-Encoding of "identity" since we don't
            # support encodings such as x-gzip or x-deflate.
            if not skip_accept_encoding:
                self.putheader('Accept-Encoding', 'identity')

            # we can accept "chunked" Transfer-Encodings, but no others
            # NOTE: no TE header implies *only* "chunked"
            #self.putheader('TE', 'chunked')

            # if TE is supplied in the header, then it must appear in a
            # Connection header.
            #self.putheader('Connection', 'TE')

        else:
            # For HTTP/1.0, the server will assume "not chunked"
            pass

    def _encode_request(self, request):
        # ASCII also helps prevent CVE-2019-9740.
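        # e.g. 'GET /robots.txt HTTP/1.1' -> b'GET /robots.txt HTTP/1.1'
        # (illustrative path only; a non-ASCII request line raises
        # UnicodeEncodeError here instead of being sent on the wire)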
1188 return request.encode('ascii') 1189 1190 def _validate_method(self, method): 1191 """Validate a method name for putrequest.""" 1192 # prevent http header injection 1193 match = _contains_disallowed_method_pchar_re.search(method) 1194 if match: 1195 raise ValueError( 1196 f"method can't contain control characters. {method!r} " 1197 f"(found at least {match.group()!r})") 1198 1199 def _validate_path(self, url): 1200 """Validate a url for putrequest.""" 1201 # Prevent CVE-2019-9740. 1202 match = _contains_disallowed_url_pchar_re.search(url) 1203 if match: 1204 raise InvalidURL(f"URL can't contain control characters. {url!r} " 1205 f"(found at least {match.group()!r})") 1206 1207 def _validate_host(self, host): 1208 """Validate a host so it doesn't contain control characters.""" 1209 # Prevent CVE-2019-18348. 1210 match = _contains_disallowed_url_pchar_re.search(host) 1211 if match: 1212 raise InvalidURL(f"URL can't contain control characters. {host!r} " 1213 f"(found at least {match.group()!r})") 1214 1215 def putheader(self, header, *values): 1216 """Send a request header line to the server. 1217 1218 For example: h.putheader('Accept', 'text/html') 1219 """ 1220 if self.__state != _CS_REQ_STARTED: 1221 raise CannotSendHeader() 1222 1223 if hasattr(header, 'encode'): 1224 header = header.encode('ascii') 1225 1226 if not _is_legal_header_name(header): 1227 raise ValueError('Invalid header name %r' % (header,)) 1228 1229 values = list(values) 1230 for i, one_value in enumerate(values): 1231 if hasattr(one_value, 'encode'): 1232 values[i] = one_value.encode('latin-1') 1233 elif isinstance(one_value, int): 1234 values[i] = str(one_value).encode('ascii') 1235 1236 if _is_illegal_header_value(values[i]): 1237 raise ValueError('Invalid header value %r' % (values[i],)) 1238 1239 value = b'\r\n\t'.join(values) 1240 header = header + b': ' + value 1241 self._output(header) 1242 1243 def endheaders(self, message_body=None, *, encode_chunked=False): 1244 """Indicate that the last header line has been sent to the server. 1245 1246 This method sends the request to the server. The optional message_body 1247 argument can be used to pass a message body associated with the 1248 request. 1249 """ 1250 if self.__state == _CS_REQ_STARTED: 1251 self.__state = _CS_REQ_SENT 1252 else: 1253 raise CannotSendHeader() 1254 self._send_output(message_body, encode_chunked=encode_chunked) 1255 1256 def request(self, method, url, body=None, headers={}, *, 1257 encode_chunked=False): 1258 """Send a complete request to the server.""" 1259 self._send_request(method, url, body, headers, encode_chunked) 1260 1261 def _send_request(self, method, url, body, headers, encode_chunked): 1262 # Honor explicitly requested Host: and Accept-Encoding: headers. 1263 header_names = frozenset(k.lower() for k in headers) 1264 skips = {} 1265 if 'host' in header_names: 1266 skips['skip_host'] = 1 1267 if 'accept-encoding' in header_names: 1268 skips['skip_accept_encoding'] = 1 1269 1270 self.putrequest(method, url, **skips) 1271 1272 # chunked encoding will happen if HTTP/1.1 is used and either 1273 # the caller passes encode_chunked=True or the following 1274 # conditions hold: 1275 # 1. content-length has not been explicitly set 1276 # 2. the body is a file or iterable, but not a str or bytes-like 1277 # 3. 
Transfer-Encoding has NOT been explicitly set by the caller 1278 1279 if 'content-length' not in header_names: 1280 # only chunk body if not explicitly set for backwards 1281 # compatibility, assuming the client code is already handling the 1282 # chunking 1283 if 'transfer-encoding' not in header_names: 1284 # if content-length cannot be automatically determined, fall 1285 # back to chunked encoding 1286 encode_chunked = False 1287 content_length = self._get_content_length(body, method) 1288 if content_length is None: 1289 if body is not None: 1290 if self.debuglevel > 0: 1291 print('Unable to determine size of %r' % body) 1292 encode_chunked = True 1293 self.putheader('Transfer-Encoding', 'chunked') 1294 else: 1295 self.putheader('Content-Length', str(content_length)) 1296 else: 1297 encode_chunked = False 1298 1299 for hdr, value in headers.items(): 1300 self.putheader(hdr, value) 1301 if isinstance(body, str): 1302 # RFC 2616 Section 3.7.1 says that text default has a 1303 # default charset of iso-8859-1. 1304 body = _encode(body, 'body') 1305 self.endheaders(body, encode_chunked=encode_chunked) 1306 1307 def getresponse(self): 1308 """Get the response from the server. 1309 1310 If the HTTPConnection is in the correct state, returns an 1311 instance of HTTPResponse or of whatever object is returned by 1312 the response_class variable. 1313 1314 If a request has not been sent or if a previous response has 1315 not be handled, ResponseNotReady is raised. If the HTTP 1316 response indicates that the connection should be closed, then 1317 it will be closed before the response is returned. When the 1318 connection is closed, the underlying socket is closed. 1319 """ 1320 1321 # if a prior response has been completed, then forget about it. 1322 if self.__response and self.__response.isclosed(): 1323 self.__response = None 1324 1325 # if a prior response exists, then it must be completed (otherwise, we 1326 # cannot read this response's header to determine the connection-close 1327 # behavior) 1328 # 1329 # note: if a prior response existed, but was connection-close, then the 1330 # socket and response were made independent of this HTTPConnection 1331 # object since a new request requires that we open a whole new 1332 # connection 1333 # 1334 # this means the prior response had one of two states: 1335 # 1) will_close: this connection was reset and the prior socket and 1336 # response operate independently 1337 # 2) persistent: the response was retained and we await its 1338 # isclosed() status to become true. 1339 # 1340 if self.__state != _CS_REQ_SENT or self.__response: 1341 raise ResponseNotReady(self.__state) 1342 1343 if self.debuglevel > 0: 1344 response = self.response_class(self.sock, self.debuglevel, 1345 method=self._method) 1346 else: 1347 response = self.response_class(self.sock, method=self._method) 1348 1349 try: 1350 try: 1351 response.begin() 1352 except ConnectionError: 1353 self.close() 1354 raise 1355 assert response.will_close != _UNKNOWN 1356 self.__state = _CS_IDLE 1357 1358 if response.will_close: 1359 # this effectively passes the connection to the response 1360 self.close() 1361 else: 1362 # remember this, so we can tell when it is complete 1363 self.__response = response 1364 1365 return response 1366 except: 1367 response.close() 1368 raise 1369 1370try: 1371 import ssl 1372except ImportError: 1373 pass 1374else: 1375 class HTTPSConnection(HTTPConnection): 1376 "This class allows communication via SSL." 
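        # A minimal usage sketch (illustrative; the host name is a
        # placeholder, not part of this module):
        #
        #     import ssl
        #     ctx = ssl.create_default_context()
        #     conn = HTTPSConnection("www.example.com", context=ctx)
        #     conn.request("GET", "/")
        #     resp = conn.getresponse()
        #     print(resp.status, resp.read(64))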
1377 1378 default_port = HTTPS_PORT 1379 1380 # XXX Should key_file and cert_file be deprecated in favour of context? 1381 1382 def __init__(self, host, port=None, key_file=None, cert_file=None, 1383 timeout=socket._GLOBAL_DEFAULT_TIMEOUT, 1384 source_address=None, *, context=None, 1385 check_hostname=None, blocksize=8192): 1386 super(HTTPSConnection, self).__init__(host, port, timeout, 1387 source_address, 1388 blocksize=blocksize) 1389 if (key_file is not None or cert_file is not None or 1390 check_hostname is not None): 1391 import warnings 1392 warnings.warn("key_file, cert_file and check_hostname are " 1393 "deprecated, use a custom context instead.", 1394 DeprecationWarning, 2) 1395 self.key_file = key_file 1396 self.cert_file = cert_file 1397 if context is None: 1398 context = ssl._create_default_https_context() 1399 # enable PHA for TLS 1.3 connections if available 1400 if context.post_handshake_auth is not None: 1401 context.post_handshake_auth = True 1402 will_verify = context.verify_mode != ssl.CERT_NONE 1403 if check_hostname is None: 1404 check_hostname = context.check_hostname 1405 if check_hostname and not will_verify: 1406 raise ValueError("check_hostname needs a SSL context with " 1407 "either CERT_OPTIONAL or CERT_REQUIRED") 1408 if key_file or cert_file: 1409 context.load_cert_chain(cert_file, key_file) 1410 # cert and key file means the user wants to authenticate. 1411 # enable TLS 1.3 PHA implicitly even for custom contexts. 1412 if context.post_handshake_auth is not None: 1413 context.post_handshake_auth = True 1414 self._context = context 1415 if check_hostname is not None: 1416 self._context.check_hostname = check_hostname 1417 1418 def connect(self): 1419 "Connect to a host on a given (SSL) port." 1420 1421 super().connect() 1422 1423 if self._tunnel_host: 1424 server_hostname = self._tunnel_host 1425 else: 1426 server_hostname = self.host 1427 1428 self.sock = self._context.wrap_socket(self.sock, 1429 server_hostname=server_hostname) 1430 1431 __all__.append("HTTPSConnection") 1432 1433class HTTPException(Exception): 1434 # Subclasses that define an __init__ must call Exception.__init__ 1435 # or define self.args. Otherwise, str() will fail. 
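    # (For example, IncompleteRead and BadStatusLine below follow this rule
    # by assigning self.args explicitly in their __init__ methods.)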
1436 pass 1437 1438class NotConnected(HTTPException): 1439 pass 1440 1441class InvalidURL(HTTPException): 1442 pass 1443 1444class UnknownProtocol(HTTPException): 1445 def __init__(self, version): 1446 self.args = version, 1447 self.version = version 1448 1449class UnknownTransferEncoding(HTTPException): 1450 pass 1451 1452class UnimplementedFileMode(HTTPException): 1453 pass 1454 1455class IncompleteRead(HTTPException): 1456 def __init__(self, partial, expected=None): 1457 self.args = partial, 1458 self.partial = partial 1459 self.expected = expected 1460 def __repr__(self): 1461 if self.expected is not None: 1462 e = ', %i more expected' % self.expected 1463 else: 1464 e = '' 1465 return '%s(%i bytes read%s)' % (self.__class__.__name__, 1466 len(self.partial), e) 1467 __str__ = object.__str__ 1468 1469class ImproperConnectionState(HTTPException): 1470 pass 1471 1472class CannotSendRequest(ImproperConnectionState): 1473 pass 1474 1475class CannotSendHeader(ImproperConnectionState): 1476 pass 1477 1478class ResponseNotReady(ImproperConnectionState): 1479 pass 1480 1481class BadStatusLine(HTTPException): 1482 def __init__(self, line): 1483 if not line: 1484 line = repr(line) 1485 self.args = line, 1486 self.line = line 1487 1488class LineTooLong(HTTPException): 1489 def __init__(self, line_type): 1490 HTTPException.__init__(self, "got more than %d bytes when reading %s" 1491 % (_MAXLINE, line_type)) 1492 1493class RemoteDisconnected(ConnectionResetError, BadStatusLine): 1494 def __init__(self, *pos, **kw): 1495 BadStatusLine.__init__(self, "") 1496 ConnectionResetError.__init__(self, *pos, **kw) 1497 1498# for backwards compatibility 1499error = HTTPException 1500
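
if __name__ == "__main__":
    # Hedged usage demo, not part of the library proper: it issues one live
    # GET request, so the host below is purely illustrative.
    conn = HTTPConnection("www.example.com", timeout=10)
    try:
        conn.request("GET", "/", headers={"Accept": "text/html"})
        resp = conn.getresponse()
        print(resp.status, resp.reason)
        body = resp.read()   # drain the body so the connection could be reused
        print("read %d bytes" % len(body))
    finally:
        conn.close()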