1from __future__ import generators 2""" 3httplib2 4 5A caching http interface that supports ETags and gzip 6to conserve bandwidth. 7 8Requires Python 2.3 or later 9 10Changelog: 112007-08-18, Rick: Modified so it's able to use a socks proxy if needed. 12 13""" 14 15__author__ = "Joe Gregorio (joe@bitworking.org)" 16__copyright__ = "Copyright 2006, Joe Gregorio" 17__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)", 18 "James Antill", 19 "Xavier Verges Farrero", 20 "Jonathan Feinberg", 21 "Blair Zajac", 22 "Sam Ruby", 23 "Louis Nyffenegger"] 24__license__ = "MIT" 25__version__ = "$Rev$" 26 27import re 28import sys 29import email 30import email.Utils 31import email.Message 32import email.FeedParser 33import StringIO 34import gzip 35import zlib 36import httplib 37import urlparse 38import base64 39import os 40import copy 41import calendar 42import time 43import random 44# remove depracated warning in python2.6 45try: 46 from hashlib import sha1 as _sha, md5 as _md5 47except ImportError: 48 import sha 49 import md5 50 _sha = sha.new 51 _md5 = md5.new 52import hmac 53from gettext import gettext as _ 54import socket 55 56try: 57 import socks 58except ImportError: 59 socks = None 60 61# Build the appropriate socket wrapper for ssl 62try: 63 import ssl # python 2.6 64 _ssl_wrap_socket = ssl.wrap_socket 65except ImportError: 66 def _ssl_wrap_socket(sock, key_file, cert_file): 67 ssl_sock = socket.ssl(sock, key_file, cert_file) 68 return httplib.FakeSocket(sock, ssl_sock) 69 70 71if sys.version_info >= (2,3): 72 from iri2uri import iri2uri 73else: 74 def iri2uri(uri): 75 return uri 76 77def has_timeout(timeout): # python 2.6 78 if hasattr(socket, '_GLOBAL_DEFAULT_TIMEOUT'): 79 return (timeout is not None and timeout is not socket._GLOBAL_DEFAULT_TIMEOUT) 80 return (timeout is not None) 81 82__all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error', 83 'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent', 84 'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError', 85 'debuglevel'] 86 87 88# The httplib debug level, set to a non-zero value to get debug output 89debuglevel = 0 90 91 92# Python 2.3 support 93if sys.version_info < (2,4): 94 def sorted(seq): 95 seq.sort() 96 return seq 97 98# Python 2.3 support 99def HTTPResponse__getheaders(self): 100 """Return list of (header, value) tuples.""" 101 if self.msg is None: 102 raise httplib.ResponseNotReady() 103 return self.msg.items() 104 105if not hasattr(httplib.HTTPResponse, 'getheaders'): 106 httplib.HTTPResponse.getheaders = HTTPResponse__getheaders 107 108# All exceptions raised here derive from HttpLib2Error 109class HttpLib2Error(Exception): pass 110 111# Some exceptions can be caught and optionally 112# be turned back into responses. 113class HttpLib2ErrorWithResponse(HttpLib2Error): 114 def __init__(self, desc, response, content): 115 self.response = response 116 self.content = content 117 HttpLib2Error.__init__(self, desc) 118 119class RedirectMissingLocation(HttpLib2ErrorWithResponse): pass 120class RedirectLimit(HttpLib2ErrorWithResponse): pass 121class FailedToDecompressContent(HttpLib2ErrorWithResponse): pass 122class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse): pass 123class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse): pass 124 125class RelativeURIError(HttpLib2Error): pass 126class ServerNotFoundError(HttpLib2Error): pass 127 128# Open Items: 129# ----------- 130# Proxy support 131 132# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?) 133 134# Pluggable cache storage (supports storing the cache in 135# flat files by default. We need a plug-in architecture 136# that can support Berkeley DB and Squid) 137 138# == Known Issues == 139# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator. 140# Does not handle Cache-Control: max-stale 141# Does not use Age: headers when calculating cache freshness. 142 143 144# The number of redirections to follow before giving up. 145# Note that only GET redirects are automatically followed. 146# Will also honor 301 requests by saving that info and never 147# requesting that URI again. 148DEFAULT_MAX_REDIRECTS = 5 149 150# Which headers are hop-by-hop headers by default 151HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade'] 152 153def _get_end2end_headers(response): 154 hopbyhop = list(HOP_BY_HOP) 155 hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')]) 156 return [header for header in response.keys() if header not in hopbyhop] 157 158URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") 159 160def parse_uri(uri): 161 """Parses a URI using the regex given in Appendix B of RFC 3986. 162 163 (scheme, authority, path, query, fragment) = parse_uri(uri) 164 """ 165 groups = URI.match(uri).groups() 166 return (groups[1], groups[3], groups[4], groups[6], groups[8]) 167 168def urlnorm(uri): 169 (scheme, authority, path, query, fragment) = parse_uri(uri) 170 if not scheme or not authority: 171 raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri) 172 authority = authority.lower() 173 scheme = scheme.lower() 174 if not path: 175 path = "/" 176 # Could do syntax based normalization of the URI before 177 # computing the digest. See Section 6.2.2 of Std 66. 178 request_uri = query and "?".join([path, query]) or path 179 scheme = scheme.lower() 180 defrag_uri = scheme + "://" + authority + request_uri 181 return scheme, authority, request_uri, defrag_uri 182 183 184# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/) 185re_url_scheme = re.compile(r'^\w+://') 186re_slash = re.compile(r'[?/:|]+') 187 188def safename(filename): 189 """Return a filename suitable for the cache. 190 191 Strips dangerous and common characters to create a filename we 192 can use to store the cache in. 193 """ 194 195 try: 196 if re_url_scheme.match(filename): 197 if isinstance(filename,str): 198 filename = filename.decode('utf-8') 199 filename = filename.encode('idna') 200 else: 201 filename = filename.encode('idna') 202 except UnicodeError: 203 pass 204 if isinstance(filename,unicode): 205 filename=filename.encode('utf-8') 206 filemd5 = _md5(filename).hexdigest() 207 filename = re_url_scheme.sub("", filename) 208 filename = re_slash.sub(",", filename) 209 210 # limit length of filename 211 if len(filename)>200: 212 filename=filename[:200] 213 return ",".join((filename, filemd5)) 214 215NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+') 216def _normalize_headers(headers): 217 return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip()) for (key, value) in headers.iteritems()]) 218 219def _parse_cache_control(headers): 220 retval = {} 221 if headers.has_key('cache-control'): 222 parts = headers['cache-control'].split(',') 223 parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)]) for part in parts if -1 != part.find("=")] 224 parts_wo_args = [(name.strip().lower(), 1) for name in parts if -1 == name.find("=")] 225 retval = dict(parts_with_args + parts_wo_args) 226 return retval 227 228# Whether to use a strict mode to parse WWW-Authenticate headers 229# Might lead to bad results in case of ill-formed header value, 230# so disabled by default, falling back to relaxed parsing. 231# Set to true to turn on, usefull for testing servers. 232USE_WWW_AUTH_STRICT_PARSING = 0 233 234# In regex below: 235# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP 236# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space 237# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both: 238# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"? 239WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$") 240WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$") 241UNQUOTE_PAIRS = re.compile(r'\\(.)') 242def _parse_www_authenticate(headers, headername='www-authenticate'): 243 """Returns a dictionary of dictionaries, one dict 244 per auth_scheme.""" 245 retval = {} 246 if headers.has_key(headername): 247 authenticate = headers[headername].strip() 248 www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED 249 while authenticate: 250 # Break off the scheme at the beginning of the line 251 if headername == 'authentication-info': 252 (auth_scheme, the_rest) = ('digest', authenticate) 253 else: 254 (auth_scheme, the_rest) = authenticate.split(" ", 1) 255 # Now loop over all the key value pairs that come after the scheme, 256 # being careful not to roll into the next scheme 257 match = www_auth.search(the_rest) 258 auth_params = {} 259 while match: 260 if match and len(match.groups()) == 3: 261 (key, value, the_rest) = match.groups() 262 auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')]) 263 match = www_auth.search(the_rest) 264 retval[auth_scheme.lower()] = auth_params 265 authenticate = the_rest.strip() 266 return retval 267 268 269def _entry_disposition(response_headers, request_headers): 270 """Determine freshness from the Date, Expires and Cache-Control headers. 271 272 We don't handle the following: 273 274 1. Cache-Control: max-stale 275 2. Age: headers are not used in the calculations. 276 277 Not that this algorithm is simpler than you might think 278 because we are operating as a private (non-shared) cache. 279 This lets us ignore 's-maxage'. We can also ignore 280 'proxy-invalidate' since we aren't a proxy. 281 We will never return a stale document as 282 fresh as a design decision, and thus the non-implementation 283 of 'max-stale'. This also lets us safely ignore 'must-revalidate' 284 since we operate as if every server has sent 'must-revalidate'. 285 Since we are private we get to ignore both 'public' and 286 'private' parameters. We also ignore 'no-transform' since 287 we don't do any transformations. 288 The 'no-store' parameter is handled at a higher level. 289 So the only Cache-Control parameters we look at are: 290 291 no-cache 292 only-if-cached 293 max-age 294 min-fresh 295 """ 296 297 retval = "STALE" 298 cc = _parse_cache_control(request_headers) 299 cc_response = _parse_cache_control(response_headers) 300 301 if request_headers.has_key('pragma') and request_headers['pragma'].lower().find('no-cache') != -1: 302 retval = "TRANSPARENT" 303 if 'cache-control' not in request_headers: 304 request_headers['cache-control'] = 'no-cache' 305 elif cc.has_key('no-cache'): 306 retval = "TRANSPARENT" 307 elif cc_response.has_key('no-cache'): 308 retval = "STALE" 309 elif cc.has_key('only-if-cached'): 310 retval = "FRESH" 311 elif response_headers.has_key('date'): 312 date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date'])) 313 now = time.time() 314 current_age = max(0, now - date) 315 if cc_response.has_key('max-age'): 316 try: 317 freshness_lifetime = int(cc_response['max-age']) 318 except ValueError: 319 freshness_lifetime = 0 320 elif response_headers.has_key('expires'): 321 expires = email.Utils.parsedate_tz(response_headers['expires']) 322 if None == expires: 323 freshness_lifetime = 0 324 else: 325 freshness_lifetime = max(0, calendar.timegm(expires) - date) 326 else: 327 freshness_lifetime = 0 328 if cc.has_key('max-age'): 329 try: 330 freshness_lifetime = int(cc['max-age']) 331 except ValueError: 332 freshness_lifetime = 0 333 if cc.has_key('min-fresh'): 334 try: 335 min_fresh = int(cc['min-fresh']) 336 except ValueError: 337 min_fresh = 0 338 current_age += min_fresh 339 if freshness_lifetime > current_age: 340 retval = "FRESH" 341 return retval 342 343def _decompressContent(response, new_content): 344 content = new_content 345 try: 346 encoding = response.get('content-encoding', None) 347 if encoding in ['gzip', 'deflate']: 348 if encoding == 'gzip': 349 content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read() 350 if encoding == 'deflate': 351 content = zlib.decompress(content) 352 response['content-length'] = str(len(content)) 353 # Record the historical presence of the encoding in a way the won't interfere. 354 response['-content-encoding'] = response['content-encoding'] 355 del response['content-encoding'] 356 except IOError: 357 content = "" 358 raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content) 359 return content 360 361def _updateCache(request_headers, response_headers, content, cache, cachekey): 362 if cachekey: 363 cc = _parse_cache_control(request_headers) 364 cc_response = _parse_cache_control(response_headers) 365 if cc.has_key('no-store') or cc_response.has_key('no-store'): 366 cache.delete(cachekey) 367 else: 368 info = email.Message.Message() 369 for key, value in response_headers.iteritems(): 370 if key not in ['status','content-encoding','transfer-encoding']: 371 info[key] = value 372 373 # Add annotations to the cache to indicate what headers 374 # are variant for this request. 375 vary = response_headers.get('vary', None) 376 if vary: 377 vary_headers = vary.lower().replace(' ', '').split(',') 378 for header in vary_headers: 379 key = '-varied-%s' % header 380 try: 381 info[key] = request_headers[header] 382 except KeyError: 383 pass 384 385 status = response_headers.status 386 if status == 304: 387 status = 200 388 389 status_header = 'status: %d\r\n' % response_headers.status 390 391 header_str = info.as_string() 392 393 header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str) 394 text = "".join([status_header, header_str, content]) 395 396 cache.set(cachekey, text) 397 398def _cnonce(): 399 dig = _md5("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest() 400 return dig[:16] 401 402def _wsse_username_token(cnonce, iso_now, password): 403 return base64.b64encode(_sha("%s%s%s" % (cnonce, iso_now, password)).digest()).strip() 404 405 406# For credentials we need two things, first 407# a pool of credential to try (not necesarily tied to BAsic, Digest, etc.) 408# Then we also need a list of URIs that have already demanded authentication 409# That list is tricky since sub-URIs can take the same auth, or the 410# auth scheme may change as you descend the tree. 411# So we also need each Auth instance to be able to tell us 412# how close to the 'top' it is. 413 414class Authentication(object): 415 def __init__(self, credentials, host, request_uri, headers, response, content, http): 416 (scheme, authority, path, query, fragment) = parse_uri(request_uri) 417 self.path = path 418 self.host = host 419 self.credentials = credentials 420 self.http = http 421 422 def depth(self, request_uri): 423 (scheme, authority, path, query, fragment) = parse_uri(request_uri) 424 return request_uri[len(self.path):].count("/") 425 426 def inscope(self, host, request_uri): 427 # XXX Should we normalize the request_uri? 428 (scheme, authority, path, query, fragment) = parse_uri(request_uri) 429 return (host == self.host) and path.startswith(self.path) 430 431 def request(self, method, request_uri, headers, content): 432 """Modify the request headers to add the appropriate 433 Authorization header. Over-rise this in sub-classes.""" 434 pass 435 436 def response(self, response, content): 437 """Gives us a chance to update with new nonces 438 or such returned from the last authorized response. 439 Over-rise this in sub-classes if necessary. 440 441 Return TRUE is the request is to be retried, for 442 example Digest may return stale=true. 443 """ 444 return False 445 446 447 448class BasicAuthentication(Authentication): 449 def __init__(self, credentials, host, request_uri, headers, response, content, http): 450 Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) 451 452 def request(self, method, request_uri, headers, content): 453 """Modify the request headers to add the appropriate 454 Authorization header.""" 455 headers['authorization'] = 'Basic ' + base64.b64encode("%s:%s" % self.credentials).strip() 456 457 458class DigestAuthentication(Authentication): 459 """Only do qop='auth' and MD5, since that 460 is all Apache currently implements""" 461 def __init__(self, credentials, host, request_uri, headers, response, content, http): 462 Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) 463 challenge = _parse_www_authenticate(response, 'www-authenticate') 464 self.challenge = challenge['digest'] 465 qop = self.challenge.get('qop', 'auth') 466 self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None 467 if self.challenge['qop'] is None: 468 raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop)) 469 self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5').upper() 470 if self.challenge['algorithm'] != 'MD5': 471 raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm'])) 472 self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]]) 473 self.challenge['nc'] = 1 474 475 def request(self, method, request_uri, headers, content, cnonce = None): 476 """Modify the request headers""" 477 H = lambda x: _md5(x).hexdigest() 478 KD = lambda s, d: H("%s:%s" % (s, d)) 479 A2 = "".join([method, ":", request_uri]) 480 self.challenge['cnonce'] = cnonce or _cnonce() 481 request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'], 482 '%08x' % self.challenge['nc'], 483 self.challenge['cnonce'], 484 self.challenge['qop'], H(A2) 485 )) 486 headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % ( 487 self.credentials[0], 488 self.challenge['realm'], 489 self.challenge['nonce'], 490 request_uri, 491 self.challenge['algorithm'], 492 request_digest, 493 self.challenge['qop'], 494 self.challenge['nc'], 495 self.challenge['cnonce'], 496 ) 497 self.challenge['nc'] += 1 498 499 def response(self, response, content): 500 if not response.has_key('authentication-info'): 501 challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {}) 502 if 'true' == challenge.get('stale'): 503 self.challenge['nonce'] = challenge['nonce'] 504 self.challenge['nc'] = 1 505 return True 506 else: 507 updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {}) 508 509 if updated_challenge.has_key('nextnonce'): 510 self.challenge['nonce'] = updated_challenge['nextnonce'] 511 self.challenge['nc'] = 1 512 return False 513 514 515class HmacDigestAuthentication(Authentication): 516 """Adapted from Robert Sayre's code and DigestAuthentication above.""" 517 __author__ = "Thomas Broyer (t.broyer@ltgt.net)" 518 519 def __init__(self, credentials, host, request_uri, headers, response, content, http): 520 Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) 521 challenge = _parse_www_authenticate(response, 'www-authenticate') 522 self.challenge = challenge['hmacdigest'] 523 # TODO: self.challenge['domain'] 524 self.challenge['reason'] = self.challenge.get('reason', 'unauthorized') 525 if self.challenge['reason'] not in ['unauthorized', 'integrity']: 526 self.challenge['reason'] = 'unauthorized' 527 self.challenge['salt'] = self.challenge.get('salt', '') 528 if not self.challenge.get('snonce'): 529 raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty.")) 530 self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1') 531 if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']: 532 raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm'])) 533 self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1') 534 if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']: 535 raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm'])) 536 if self.challenge['algorithm'] == 'HMAC-MD5': 537 self.hashmod = _md5 538 else: 539 self.hashmod = _sha 540 if self.challenge['pw-algorithm'] == 'MD5': 541 self.pwhashmod = _md5 542 else: 543 self.pwhashmod = _sha 544 self.key = "".join([self.credentials[0], ":", 545 self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(), 546 ":", self.challenge['realm'] 547 ]) 548 self.key = self.pwhashmod.new(self.key).hexdigest().lower() 549 550 def request(self, method, request_uri, headers, content): 551 """Modify the request headers""" 552 keys = _get_end2end_headers(headers) 553 keylist = "".join(["%s " % k for k in keys]) 554 headers_val = "".join([headers[k] for k in keys]) 555 created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime()) 556 cnonce = _cnonce() 557 request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val) 558 request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower() 559 headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % ( 560 self.credentials[0], 561 self.challenge['realm'], 562 self.challenge['snonce'], 563 cnonce, 564 request_uri, 565 created, 566 request_digest, 567 keylist, 568 ) 569 570 def response(self, response, content): 571 challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {}) 572 if challenge.get('reason') in ['integrity', 'stale']: 573 return True 574 return False 575 576 577class WsseAuthentication(Authentication): 578 """This is thinly tested and should not be relied upon. 579 At this time there isn't any third party server to test against. 580 Blogger and TypePad implemented this algorithm at one point 581 but Blogger has since switched to Basic over HTTPS and 582 TypePad has implemented it wrong, by never issuing a 401 583 challenge but instead requiring your client to telepathically know that 584 their endpoint is expecting WSSE profile="UsernameToken".""" 585 def __init__(self, credentials, host, request_uri, headers, response, content, http): 586 Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) 587 588 def request(self, method, request_uri, headers, content): 589 """Modify the request headers to add the appropriate 590 Authorization header.""" 591 headers['Authorization'] = 'WSSE profile="UsernameToken"' 592 iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) 593 cnonce = _cnonce() 594 password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1]) 595 headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % ( 596 self.credentials[0], 597 password_digest, 598 cnonce, 599 iso_now) 600 601class GoogleLoginAuthentication(Authentication): 602 def __init__(self, credentials, host, request_uri, headers, response, content, http): 603 from urllib import urlencode 604 Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) 605 challenge = _parse_www_authenticate(response, 'www-authenticate') 606 service = challenge['googlelogin'].get('service', 'xapi') 607 # Bloggger actually returns the service in the challenge 608 # For the rest we guess based on the URI 609 if service == 'xapi' and request_uri.find("calendar") > 0: 610 service = "cl" 611 # No point in guessing Base or Spreadsheet 612 #elif request_uri.find("spreadsheets") > 0: 613 # service = "wise" 614 615 auth = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent']) 616 resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'}) 617 lines = content.split('\n') 618 d = dict([tuple(line.split("=", 1)) for line in lines if line]) 619 if resp.status == 403: 620 self.Auth = "" 621 else: 622 self.Auth = d['Auth'] 623 624 def request(self, method, request_uri, headers, content): 625 """Modify the request headers to add the appropriate 626 Authorization header.""" 627 headers['authorization'] = 'GoogleLogin Auth=' + self.Auth 628 629 630AUTH_SCHEME_CLASSES = { 631 "basic": BasicAuthentication, 632 "wsse": WsseAuthentication, 633 "digest": DigestAuthentication, 634 "hmacdigest": HmacDigestAuthentication, 635 "googlelogin": GoogleLoginAuthentication 636} 637 638AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"] 639 640class FileCache(object): 641 """Uses a local directory as a store for cached files. 642 Not really safe to use if multiple threads or processes are going to 643 be running on the same cache. 644 """ 645 def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior 646 self.cache = cache 647 self.safe = safe 648 if not os.path.exists(cache): 649 os.makedirs(self.cache) 650 651 def get(self, key): 652 retval = None 653 cacheFullPath = os.path.join(self.cache, self.safe(key)) 654 try: 655 f = file(cacheFullPath, "rb") 656 retval = f.read() 657 f.close() 658 except IOError: 659 pass 660 return retval 661 662 def set(self, key, value): 663 cacheFullPath = os.path.join(self.cache, self.safe(key)) 664 f = file(cacheFullPath, "wb") 665 f.write(value) 666 f.close() 667 668 def delete(self, key): 669 cacheFullPath = os.path.join(self.cache, self.safe(key)) 670 if os.path.exists(cacheFullPath): 671 os.remove(cacheFullPath) 672 673class Credentials(object): 674 def __init__(self): 675 self.credentials = [] 676 677 def add(self, name, password, domain=""): 678 self.credentials.append((domain.lower(), name, password)) 679 680 def clear(self): 681 self.credentials = [] 682 683 def iter(self, domain): 684 for (cdomain, name, password) in self.credentials: 685 if cdomain == "" or domain == cdomain: 686 yield (name, password) 687 688class KeyCerts(Credentials): 689 """Identical to Credentials except that 690 name/password are mapped to key/cert.""" 691 pass 692 693 694class ProxyInfo(object): 695 """Collect information required to use a proxy.""" 696 def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None): 697 """The parameter proxy_type must be set to one of socks.PROXY_TYPE_XXX 698 constants. For example: 699 700p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000) 701 """ 702 self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns, self.proxy_user, self.proxy_pass = proxy_type, proxy_host, proxy_port, proxy_rdns, proxy_user, proxy_pass 703 704 def astuple(self): 705 return (self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns, 706 self.proxy_user, self.proxy_pass) 707 708 def isgood(self): 709 return socks and (self.proxy_host != None) and (self.proxy_port != None) 710 711 712class HTTPConnectionWithTimeout(httplib.HTTPConnection): 713 """HTTPConnection subclass that supports timeouts""" 714 715 def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None): 716 httplib.HTTPConnection.__init__(self, host, port, strict) 717 self.timeout = timeout 718 self.proxy_info = proxy_info 719 720 def connect(self): 721 """Connect to the host and port specified in __init__.""" 722 # Mostly verbatim from httplib.py. 723 msg = "getaddrinfo returns an empty list" 724 for res in socket.getaddrinfo(self.host, self.port, 0, 725 socket.SOCK_STREAM): 726 af, socktype, proto, canonname, sa = res 727 try: 728 if self.proxy_info and self.proxy_info.isgood(): 729 self.sock = socks.socksocket(af, socktype, proto) 730 self.sock.setproxy(*self.proxy_info.astuple()) 731 else: 732 self.sock = socket.socket(af, socktype, proto) 733 # Different from httplib: support timeouts. 734 if has_timeout(self.timeout): 735 self.sock.settimeout(self.timeout) 736 # End of difference from httplib. 737 if self.debuglevel > 0: 738 print "connect: (%s, %s)" % (self.host, self.port) 739 740 self.sock.connect(sa) 741 except socket.error, msg: 742 if self.debuglevel > 0: 743 print 'connect fail:', (self.host, self.port) 744 if self.sock: 745 self.sock.close() 746 self.sock = None 747 continue 748 break 749 if not self.sock: 750 raise socket.error, msg 751 752class HTTPSConnectionWithTimeout(httplib.HTTPSConnection): 753 "This class allows communication via SSL." 754 755 def __init__(self, host, port=None, key_file=None, cert_file=None, 756 strict=None, timeout=None, proxy_info=None): 757 httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file, 758 cert_file=cert_file, strict=strict) 759 self.timeout = timeout 760 self.proxy_info = proxy_info 761 762 def connect(self): 763 "Connect to a host on a given (SSL) port." 764 765 if self.proxy_info and self.proxy_info.isgood(): 766 sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM) 767 sock.setproxy(*self.proxy_info.astuple()) 768 else: 769 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 770 771 if has_timeout(self.timeout): 772 sock.settimeout(self.timeout) 773 sock.connect((self.host, self.port)) 774 self.sock =_ssl_wrap_socket(sock, self.key_file, self.cert_file) 775 776 777 778class Http(object): 779 """An HTTP client that handles: 780- all methods 781- caching 782- ETags 783- compression, 784- HTTPS 785- Basic 786- Digest 787- WSSE 788 789and more. 790 """ 791 def __init__(self, cache=None, timeout=None, proxy_info=None): 792 """The value of proxy_info is a ProxyInfo instance. 793 794If 'cache' is a string then it is used as a directory name 795for a disk cache. Otherwise it must be an object that supports 796the same interface as FileCache.""" 797 self.proxy_info = proxy_info 798 # Map domain name to an httplib connection 799 self.connections = {} 800 # The location of the cache, for now a directory 801 # where cached responses are held. 802 if cache and isinstance(cache, str): 803 self.cache = FileCache(cache) 804 else: 805 self.cache = cache 806 807 # Name/password 808 self.credentials = Credentials() 809 810 # Key/cert 811 self.certificates = KeyCerts() 812 813 # authorization objects 814 self.authorizations = [] 815 816 # If set to False then no redirects are followed, even safe ones. 817 self.follow_redirects = True 818 819 # Which HTTP methods do we apply optimistic concurrency to, i.e. 820 # which methods get an "if-match:" etag header added to them. 821 self.optimistic_concurrency_methods = ["PUT"] 822 823 # If 'follow_redirects' is True, and this is set to True then 824 # all redirecs are followed, including unsafe ones. 825 self.follow_all_redirects = False 826 827 self.ignore_etag = False 828 829 self.force_exception_to_status_code = False 830 831 self.timeout = timeout 832 833 def _auth_from_challenge(self, host, request_uri, headers, response, content): 834 """A generator that creates Authorization objects 835 that can be applied to requests. 836 """ 837 challenges = _parse_www_authenticate(response, 'www-authenticate') 838 for cred in self.credentials.iter(host): 839 for scheme in AUTH_SCHEME_ORDER: 840 if challenges.has_key(scheme): 841 yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self) 842 843 def add_credentials(self, name, password, domain=""): 844 """Add a name and password that will be used 845 any time a request requires authentication.""" 846 self.credentials.add(name, password, domain) 847 848 def add_certificate(self, key, cert, domain): 849 """Add a key and cert that will be used 850 any time a request requires authentication.""" 851 self.certificates.add(key, cert, domain) 852 853 def clear_credentials(self): 854 """Remove all the names and passwords 855 that are used for authentication""" 856 self.credentials.clear() 857 self.authorizations = [] 858 859 def _conn_request(self, conn, request_uri, method, body, headers): 860 for i in range(2): 861 try: 862 conn.request(method, request_uri, body, headers) 863 except socket.gaierror: 864 conn.close() 865 raise ServerNotFoundError("Unable to find the server at %s" % conn.host) 866 except (socket.error, httplib.HTTPException): 867 # Just because the server closed the connection doesn't apparently mean 868 # that the server didn't send a response. 869 pass 870 try: 871 response = conn.getresponse() 872 except (socket.error, httplib.HTTPException): 873 if i == 0: 874 conn.close() 875 conn.connect() 876 continue 877 else: 878 raise 879 else: 880 content = "" 881 if method == "HEAD": 882 response.close() 883 else: 884 content = response.read() 885 response = Response(response) 886 if method != "HEAD": 887 content = _decompressContent(response, content) 888 break 889 return (response, content) 890 891 892 def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey): 893 """Do the actual request using the connection object 894 and also follow one level of redirects if necessary""" 895 896 auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)] 897 auth = auths and sorted(auths)[0][1] or None 898 if auth: 899 auth.request(method, request_uri, headers, body) 900 901 (response, content) = self._conn_request(conn, request_uri, method, body, headers) 902 903 if auth: 904 if auth.response(response, body): 905 auth.request(method, request_uri, headers, body) 906 (response, content) = self._conn_request(conn, request_uri, method, body, headers ) 907 response._stale_digest = 1 908 909 if response.status == 401: 910 for authorization in self._auth_from_challenge(host, request_uri, headers, response, content): 911 authorization.request(method, request_uri, headers, body) 912 (response, content) = self._conn_request(conn, request_uri, method, body, headers, ) 913 if response.status != 401: 914 self.authorizations.append(authorization) 915 authorization.response(response, body) 916 break 917 918 if (self.follow_all_redirects or (method in ["GET", "HEAD"]) or response.status == 303): 919 if self.follow_redirects and response.status in [300, 301, 302, 303, 307]: 920 # Pick out the location header and basically start from the beginning 921 # remembering first to strip the ETag header and decrement our 'depth' 922 if redirections: 923 if not response.has_key('location') and response.status != 300: 924 raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."), response, content) 925 # Fix-up relative redirects (which violate an RFC 2616 MUST) 926 if response.has_key('location'): 927 location = response['location'] 928 (scheme, authority, path, query, fragment) = parse_uri(location) 929 if authority == None: 930 response['location'] = urlparse.urljoin(absolute_uri, location) 931 if response.status == 301 and method in ["GET", "HEAD"]: 932 response['-x-permanent-redirect-url'] = response['location'] 933 if not response.has_key('content-location'): 934 response['content-location'] = absolute_uri 935 _updateCache(headers, response, content, self.cache, cachekey) 936 if headers.has_key('if-none-match'): 937 del headers['if-none-match'] 938 if headers.has_key('if-modified-since'): 939 del headers['if-modified-since'] 940 if response.has_key('location'): 941 location = response['location'] 942 old_response = copy.deepcopy(response) 943 if not old_response.has_key('content-location'): 944 old_response['content-location'] = absolute_uri 945 redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method 946 (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1) 947 response.previous = old_response 948 else: 949 raise RedirectLimit( _("Redirected more times than rediection_limit allows."), response, content) 950 elif response.status in [200, 203] and method == "GET": 951 # Don't cache 206's since we aren't going to handle byte range requests 952 if not response.has_key('content-location'): 953 response['content-location'] = absolute_uri 954 _updateCache(headers, response, content, self.cache, cachekey) 955 956 return (response, content) 957 958 def _normalize_headers(self, headers): 959 return _normalize_headers(headers) 960 961# Need to catch and rebrand some exceptions 962# Then need to optionally turn all exceptions into status codes 963# including all socket.* and httplib.* exceptions. 964 965 966 def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None): 967 """ Performs a single HTTP request. 968The 'uri' is the URI of the HTTP resource and can begin 969with either 'http' or 'https'. The value of 'uri' must be an absolute URI. 970 971The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc. 972There is no restriction on the methods allowed. 973 974The 'body' is the entity body to be sent with the request. It is a string 975object. 976 977Any extra headers that are to be sent with the request should be provided in the 978'headers' dictionary. 979 980The maximum number of redirect to follow before raising an 981exception is 'redirections. The default is 5. 982 983The return value is a tuple of (response, content), the first 984being and instance of the 'Response' class, the second being 985a string that contains the response entity body. 986 """ 987 try: 988 if headers is None: 989 headers = {} 990 else: 991 headers = self._normalize_headers(headers) 992 993 if not headers.has_key('user-agent'): 994 headers['user-agent'] = "Python-httplib2/%s" % __version__ 995 996 uri = iri2uri(uri) 997 998 (scheme, authority, request_uri, defrag_uri) = urlnorm(uri) 999 domain_port = authority.split(":")[0:2] 1000 if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http': 1001 scheme = 'https' 1002 authority = domain_port[0] 1003 1004 conn_key = scheme+":"+authority 1005 if conn_key in self.connections: 1006 conn = self.connections[conn_key] 1007 else: 1008 if not connection_type: 1009 connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout 1010 certs = list(self.certificates.iter(authority)) 1011 if scheme == 'https' and certs: 1012 conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0], 1013 cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info) 1014 else: 1015 conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info) 1016 conn.set_debuglevel(debuglevel) 1017 1018 if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers: 1019 headers['accept-encoding'] = 'gzip, deflate' 1020 1021 info = email.Message.Message() 1022 cached_value = None 1023 if self.cache: 1024 cachekey = defrag_uri 1025 cached_value = self.cache.get(cachekey) 1026 if cached_value: 1027 # info = email.message_from_string(cached_value) 1028 # 1029 # Need to replace the line above with the kludge below 1030 # to fix the non-existent bug not fixed in this 1031 # bug report: http://mail.python.org/pipermail/python-bugs-list/2005-September/030289.html 1032 try: 1033 info, content = cached_value.split('\r\n\r\n', 1) 1034 feedparser = email.FeedParser.FeedParser() 1035 feedparser.feed(info) 1036 info = feedparser.close() 1037 feedparser._parse = None 1038 except IndexError: 1039 self.cache.delete(cachekey) 1040 cachekey = None 1041 cached_value = None 1042 else: 1043 cachekey = None 1044 1045 if method in self.optimistic_concurrency_methods and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers: 1046 # http://www.w3.org/1999/04/Editing/ 1047 headers['if-match'] = info['etag'] 1048 1049 if method not in ["GET", "HEAD"] and self.cache and cachekey: 1050 # RFC 2616 Section 13.10 1051 self.cache.delete(cachekey) 1052 1053 # Check the vary header in the cache to see if this request 1054 # matches what varies in the cache. 1055 if method in ['GET', 'HEAD'] and 'vary' in info: 1056 vary = info['vary'] 1057 vary_headers = vary.lower().replace(' ', '').split(',') 1058 for header in vary_headers: 1059 key = '-varied-%s' % header 1060 value = info[key] 1061 if headers.get(header, '') != value: 1062 cached_value = None 1063 break 1064 1065 if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers: 1066 if info.has_key('-x-permanent-redirect-url'): 1067 # Should cached permanent redirects be counted in our redirection count? For now, yes. 1068 (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1) 1069 response.previous = Response(info) 1070 response.previous.fromcache = True 1071 else: 1072 # Determine our course of action: 1073 # Is the cached entry fresh or stale? 1074 # Has the client requested a non-cached response? 1075 # 1076 # There seems to be three possible answers: 1077 # 1. [FRESH] Return the cache entry w/o doing a GET 1078 # 2. [STALE] Do the GET (but add in cache validators if available) 1079 # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request 1080 entry_disposition = _entry_disposition(info, headers) 1081 1082 if entry_disposition == "FRESH": 1083 if not cached_value: 1084 info['status'] = '504' 1085 content = "" 1086 response = Response(info) 1087 if cached_value: 1088 response.fromcache = True 1089 return (response, content) 1090 1091 if entry_disposition == "STALE": 1092 if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers: 1093 headers['if-none-match'] = info['etag'] 1094 if info.has_key('last-modified') and not 'last-modified' in headers: 1095 headers['if-modified-since'] = info['last-modified'] 1096 elif entry_disposition == "TRANSPARENT": 1097 pass 1098 1099 (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey) 1100 1101 if response.status == 304 and method == "GET": 1102 # Rewrite the cache entry with the new end-to-end headers 1103 # Take all headers that are in response 1104 # and overwrite their values in info. 1105 # unless they are hop-by-hop, or are listed in the connection header. 1106 1107 for key in _get_end2end_headers(response): 1108 info[key] = response[key] 1109 merged_response = Response(info) 1110 if hasattr(response, "_stale_digest"): 1111 merged_response._stale_digest = response._stale_digest 1112 _updateCache(headers, merged_response, content, self.cache, cachekey) 1113 response = merged_response 1114 response.status = 200 1115 response.fromcache = True 1116 1117 elif response.status == 200: 1118 content = new_content 1119 else: 1120 self.cache.delete(cachekey) 1121 content = new_content 1122 else: 1123 cc = _parse_cache_control(headers) 1124 if cc.has_key('only-if-cached'): 1125 info['status'] = '504' 1126 response = Response(info) 1127 content = "" 1128 else: 1129 (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey) 1130 except Exception, e: 1131 if self.force_exception_to_status_code: 1132 if isinstance(e, HttpLib2ErrorWithResponse): 1133 response = e.response 1134 content = e.content 1135 response.status = 500 1136 response.reason = str(e) 1137 elif isinstance(e, socket.timeout): 1138 content = "Request Timeout" 1139 response = Response( { 1140 "content-type": "text/plain", 1141 "status": "408", 1142 "content-length": len(content) 1143 }) 1144 response.reason = "Request Timeout" 1145 else: 1146 content = str(e) 1147 response = Response( { 1148 "content-type": "text/plain", 1149 "status": "400", 1150 "content-length": len(content) 1151 }) 1152 response.reason = "Bad Request" 1153 else: 1154 raise 1155 1156 1157 return (response, content) 1158 1159 1160 1161class Response(dict): 1162 """An object more like email.Message than httplib.HTTPResponse.""" 1163 1164 """Is this response from our local cache""" 1165 fromcache = False 1166 1167 """HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """ 1168 version = 11 1169 1170 "Status code returned by server. " 1171 status = 200 1172 1173 """Reason phrase returned by server.""" 1174 reason = "Ok" 1175 1176 previous = None 1177 1178 def __init__(self, info): 1179 # info is either an email.Message or 1180 # an httplib.HTTPResponse object. 1181 if isinstance(info, httplib.HTTPResponse): 1182 for key, value in info.getheaders(): 1183 self[key.lower()] = value 1184 self.status = info.status 1185 self['status'] = str(self.status) 1186 self.reason = info.reason 1187 self.version = info.version 1188 elif isinstance(info, email.Message.Message): 1189 for key, value in info.items(): 1190 self[key] = value 1191 self.status = int(self['status']) 1192 else: 1193 for key, value in info.iteritems(): 1194 self[key] = value 1195 self.status = int(self.get('status', self.status)) 1196 1197 1198 def __getattr__(self, name): 1199 if name == 'dict': 1200 return self 1201 else: 1202 raise AttributeError, name 1203