1r"""HTTP cookie handling for web clients. 2 3This module has (now fairly distant) origins in Gisle Aas' Perl module 4HTTP::Cookies, from the libwww-perl library. 5 6Docstrings, comments and debug strings in this code refer to the 7attributes of the HTTP cookie system as cookie-attributes, to distinguish 8them clearly from Python attributes. 9 10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not 11distributed with the Python standard library, but are available from 12http://wwwsearch.sf.net/): 13 14 CookieJar____ 15 / \ \ 16 FileCookieJar \ \ 17 / | \ \ \ 18 MozillaCookieJar | LWPCookieJar \ \ 19 | | \ 20 | ---MSIEBase | \ 21 | / | | \ 22 | / MSIEDBCookieJar BSDDBCookieJar 23 |/ 24 MSIECookieJar 25 26""" 27 28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', 29 'FileCookieJar', 'LWPCookieJar', 'lwp_cookie_str', 'LoadError', 30 'MozillaCookieJar'] 31 32import re, urlparse, copy, time, urllib 33try: 34 import threading as _threading 35except ImportError: 36 import dummy_threading as _threading 37import httplib # only for the default HTTP port 38from calendar import timegm 39 40debug = False # set to True to enable debugging via the logging module 41logger = None 42 43def _debug(*args): 44 if not debug: 45 return 46 global logger 47 if not logger: 48 import logging 49 logger = logging.getLogger("cookielib") 50 return logger.debug(*args) 51 52 53DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT) 54MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " 55 "instance initialised with one)") 56 57def _warn_unhandled_exception(): 58 # There are a few catch-all except: statements in this module, for 59 # catching input that's bad in unexpected ways. Warn if any 60 # exceptions are caught there. 
61 import warnings, traceback, StringIO 62 f = StringIO.StringIO() 63 traceback.print_exc(None, f) 64 msg = f.getvalue() 65 warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2) 66 67 68# Date/time conversion 69# ----------------------------------------------------------------------------- 70 71EPOCH_YEAR = 1970 72def _timegm(tt): 73 year, month, mday, hour, min, sec = tt[:6] 74 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and 75 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): 76 return timegm(tt) 77 else: 78 return None 79 80DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 81MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 82 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 83MONTHS_LOWER = [] 84for month in MONTHS: MONTHS_LOWER.append(month.lower()) 85 86def time2isoz(t=None): 87 """Return a string representing time in seconds since epoch, t. 88 89 If the function is called without an argument, it will use the current 90 time. 91 92 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", 93 representing Universal Time (UTC, aka GMT). An example of this format is: 94 95 1994-11-24 08:49:37Z 96 97 """ 98 if t is None: t = time.time() 99 year, mon, mday, hour, min, sec = time.gmtime(t)[:6] 100 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( 101 year, mon, mday, hour, min, sec) 102 103def time2netscape(t=None): 104 """Return a string representing time in seconds since epoch, t. 105 106 If the function is called without an argument, it will use the current 107 time. 
108 109 The format of the returned string is like this: 110 111 Wed, DD-Mon-YYYY HH:MM:SS GMT 112 113 """ 114 if t is None: t = time.time() 115 year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7] 116 return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % ( 117 DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec) 118 119 120UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} 121 122TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") 123def offset_from_tz_string(tz): 124 offset = None 125 if tz in UTC_ZONES: 126 offset = 0 127 else: 128 m = TIMEZONE_RE.search(tz) 129 if m: 130 offset = 3600 * int(m.group(2)) 131 if m.group(3): 132 offset = offset + 60 * int(m.group(3)) 133 if m.group(1) == '-': 134 offset = -offset 135 return offset 136 137def _str2time(day, mon, yr, hr, min, sec, tz): 138 # translate month name to number 139 # month numbers start with 1 (January) 140 try: 141 mon = MONTHS_LOWER.index(mon.lower())+1 142 except ValueError: 143 # maybe it's already a number 144 try: 145 imon = int(mon) 146 except ValueError: 147 return None 148 if 1 <= imon <= 12: 149 mon = imon 150 else: 151 return None 152 153 # make sure clock elements are defined 154 if hr is None: hr = 0 155 if min is None: min = 0 156 if sec is None: sec = 0 157 158 yr = int(yr) 159 day = int(day) 160 hr = int(hr) 161 min = int(min) 162 sec = int(sec) 163 164 if yr < 1000: 165 # find "obvious" year 166 cur_yr = time.localtime(time.time())[0] 167 m = cur_yr % 100 168 tmp = yr 169 yr = yr + cur_yr - m 170 m = m - tmp 171 if abs(m) > 50: 172 if m > 0: yr = yr + 100 173 else: yr = yr - 100 174 175 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) 176 t = _timegm((yr, mon, day, hr, min, sec, tz)) 177 178 if t is not None: 179 # adjust time using timezone string, to get absolute time since epoch 180 if tz is None: 181 tz = "UTC" 182 tz = tz.upper() 183 offset = offset_from_tz_string(tz) 184 if offset is None: 185 return None 186 t = t - offset 187 188 return t 189 
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
# Each trailing optional element owns the whitespace that follows it, so
# that a long run of spaces cannot be split between adjacent \s* in many
# ways (which made failing matches needlessly slow).
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    (?:
       ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone
       \s*
    )?
    (?:
       \(\w+\)         # ASCII representation of timezone in parens.
       \s*
    )?$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # shaped like a strict date, but "Foo" is not a month name
            return None
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is not None:
        day, mon, yr, hr, min, sec, tz = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)

# As in LOOSE_HTTP_DATE_RE, the whitespace after the timezone lives inside
# the optional timezone group to avoid ambiguous adjacent \s* runs.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   (?:
      ([-+]?\d\d?:?(:?\d\d)?
       |Z|z)             # timezone  (Z is "zero meridian", i.e. GMT)
      \s*
   )?$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is not None:
        # XXX there's an extra bit of the timezone I'm ignoring here: is
        #   this the right thing to do?
        yr, mon, day, hr, min, sec, tz, _ = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)


# Header parsing
# -----------------------------------------------------------------------------

def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result

HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not re.search(r"^\w+$", v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)

def _strip_quotes(text):
    """Return text without one leading and/or one trailing double quote."""
    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"'):
        text = text[:-1]
    return text

def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per header that yielded any
    pairs.  Known attribute names are lowercased, "expires" is converted
    with http2time(), and ("version", "0") is appended when the header did
    not carry a version attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for ii, param in enumerate(ns_header.split(';')):
            param = param.strip()

            key, sep, val = param.partition('=')
            key = key.strip()

            if not key:
                if ii == 0:
                    # no cookie name at all: ignore the whole header
                    break
                else:
                    continue

            # allow for a distinction between present and empty and missing
            # altogether
            val = val.strip() if sep else None

            if ii != 0:
                # the first pair is the cookie's own name=value; only later
                # pairs are cookie-attributes
                lc = key.lower()
                if lc in known_attrs:
                    key = lc

                if key == "version":
                    # This is an RFC 2109 cookie.
                    if val is not None:
                        val = _strip_quotes(val)
                    version_set = True
                elif key == "expires":
                    # convert expires date to seconds since epoch
                    if val is not None:
                        val = http2time(_strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result


IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B must be a proper suffix of A; merely
    # occurring somewhere inside A (x.y.com.evil vs .y.com) is not enough.
    # A != B here, so a suffix match implies N is non-empty.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True

def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.

    """
    if IPV4_RE.search(text):
        return False
    return True

def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        if A == B:
            # equal IP addresses
            return True
        return False
    initial_dot = B.startswith(".")
    if initial_dot and A.endswith(B):
        return True
    if not initial_dot and A == B:
        return True
    return False

cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = cut_port_re.sub("", host, 1)
    return host.lower()

def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    erhn = req_host = request_host(request)
    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn

def request_path(request):
    """Path component of request-URI, as defined by RFC 2965."""
    url = request.get_full_url()
    parts = urlparse.urlsplit(url)
    path = escape_path(parts.path)
    if not path.startswith("/"):
        # fix bad RFC 2396 absoluteURI
        path = "/" + path
    return path

def request_port(request):
    """Return the request's port as a string, or None if it is not numeric.

    The port is whatever follows the first ':' in request.get_host(); when
    there is no ':' the result is DEFAULT_HTTP_PORT.

    """
    host = request.get_host()
    i = host.find(':')
    if i >= 0:
        port = host[i+1:]
        try:
            int(port)
        except ValueError:
            _debug("nonnumeric port: '%s'", port)
            return None
    else:
        port = DEFAULT_HTTP_PORT
    return port

# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the matched %xx escape with its hex digits uppercased."""
    return "%%%s" % match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    path = urllib.quote(path, HTTP_PATH_SAFE)
    path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
    return path

def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    i = h.find(".")
    if i >= 0:
        #a = h[:i]  # this line is only here to show what a is
        b = h[i+1:]
        i = b.find(".")
        if is_HDN(h) and (i >= 0 or b == "local"):
            return "."+b
    return h

def is_third_party(request):
    """

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    return not domain_match(req_host, reach(request.get_origin_req_host()))


class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted with int() unless None; domain is
        lowercased; rest (the non-standard attributes) is shallow-copied.

        Raises ValueError if port is None while port_specified is True.

        """

        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if the non-standard attribute name is present."""
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard attribute name, or default if absent."""
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        """Set the non-standard attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never counts as expired.

        """
        if now is None: now = time.time()
        if (self.expires is not None) and (self.expires <= now):
            return True
        return False

    def __str__(self):
        if self.port is None: p = ""
        else: p = ":"+self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = []
        for name in ("version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ):
            attr = getattr(self, name)
            args.append("%s=%s" % (name, repr(attr)))
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)


class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server."""
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.
        """
        return True


class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies."""

    # Bit flags for the strict_ns_domain setting
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only."""
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain user-domain-matches any blocked domain."""
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list is set and domain matches none of it.

        With no allow-list (None) every domain is allowed, so the result is
        False.

        """
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # run each set_ok_<check> method in turn; the first failure rejects
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies with no version, or of a switched-off protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug(" Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies in unverifiable transactions.

        Applies only when the matching strict_*_unverifiable setting is on.

        """
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Optionally reject Netscape cookies whose name starts with '$'."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug(" illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject a specified path that is not a prefix of the request path.

        Enforced for RFC 2965 cookies, and for Netscape cookies only when
        strict_ns_set_path is set.

        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                _debug(" path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Check the cookie domain against block/allow lists and the request.

        Returns False if the domain is blocked or not allowed, or if an
        explicitly specified domain fails any of the checks below.

        """
        if self.is_blocked(cookie.domain):
            _debug(" domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug(" domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug(" country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug(" non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug(" effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug(" effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug(" host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Check a specified port list against the request port.

        Returns False if any listed port examined before a match is
        non-numeric, or if the request port is not in the list.

        """
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug(" bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                _debug(" request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        # run each return_ok_<check> method in turn; the first failure rejects
        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Refuse to return cookies of a switched-off protocol."""
        if cookie.version > 0 and not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies in unverifiable transactions.

        Applies only when the matching strict_*_unverifiable setting is on.

        """
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug(" third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug(" third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Refuse to return a secure cookie unless the request is https."""
        if cookie.secure and request.get_type() != "https":
            _debug(" secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Refuse to return a cookie that has expired as of self._now."""
        # NOTE(review): self._now is not assigned anywhere in the visible
        # code; presumably the CookieJar sets it before calling -- confirm.
        if cookie.is_expired(self._now):
            _debug(" cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Refuse to return a cookie whose port list lacks the request port."""
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug(" request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Refuse to return a cookie whose domain does not match the request.

        RFC 2965 cookies must domain-match the effective request-host name.
        Netscape cookies are compared on a label boundary: the dotted request
        host must end with the dotted cookie domain.

        """
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Compare with a leading dot so that cookie domain "example.com" is
        # not treated as a suffix match for host "pythonicexample.com".
        if domain and not domain.startswith("."):
            dotted_domain = "." + domain
        else:
            dotted_domain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Liberal check of the cookie domain.  This is here as an
        optimization to avoid having to load lots of MSIE cookie files
        unless necessary.
        """
1163 req_host, erhn = eff_request_host(request) 1164 if not req_host.startswith("."): 1165 req_host = "."+req_host 1166 if not erhn.startswith("."): 1167 erhn = "."+erhn 1168 if not (req_host.endswith(domain) or erhn.endswith(domain)): 1169 #_debug(" request domain %s does not match cookie domain %s", 1170 # req_host, domain) 1171 return False 1172 1173 if self.is_blocked(domain): 1174 _debug(" domain %s is in user block-list", domain) 1175 return False 1176 if self.is_not_allowed(domain): 1177 _debug(" domain %s is not in user allow-list", domain) 1178 return False 1179 1180 return True 1181 1182 def path_return_ok(self, path, request): 1183 _debug("- checking cookie path=%s", path) 1184 req_path = request_path(request) 1185 if not req_path.startswith(path): 1186 _debug(" %s does not path-match %s", req_path, path) 1187 return False 1188 return True 1189 1190 1191def vals_sorted_by_key(adict): 1192 keys = adict.keys() 1193 keys.sort() 1194 return map(adict.get, keys) 1195 1196def deepvalues(mapping): 1197 """Iterates over nested mapping, depth-first, in sorted order by key.""" 1198 values = vals_sorted_by_key(mapping) 1199 for obj in values: 1200 mapping = False 1201 try: 1202 obj.items 1203 except AttributeError: 1204 pass 1205 else: 1206 mapping = True 1207 for subobj in deepvalues(obj): 1208 yield subobj 1209 if not mapping: 1210 yield obj 1211 1212 1213# Used as second parameter to dict.get() method, to distinguish absent 1214# dict key from one with a None value. 1215class Absent: pass 1216 1217class CookieJar: 1218 """Collection of HTTP cookies. 1219 1220 You may not need to know about this class: try 1221 urllib2.build_opener(HTTPCookieProcessor).open(url). 
1222 1223 """ 1224 1225 non_word_re = re.compile(r"\W") 1226 quote_re = re.compile(r"([\"\\])") 1227 strict_domain_re = re.compile(r"\.?[^.]*") 1228 domain_re = re.compile(r"[^.]*") 1229 dots_re = re.compile(r"^\.+") 1230 1231 magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" 1232 1233 def __init__(self, policy=None): 1234 if policy is None: 1235 policy = DefaultCookiePolicy() 1236 self._policy = policy 1237 1238 self._cookies_lock = _threading.RLock() 1239 self._cookies = {} 1240 1241 def set_policy(self, policy): 1242 self._policy = policy 1243 1244 def _cookies_for_domain(self, domain, request): 1245 cookies = [] 1246 if not self._policy.domain_return_ok(domain, request): 1247 return [] 1248 _debug("Checking %s for cookies to return", domain) 1249 cookies_by_path = self._cookies[domain] 1250 for path in cookies_by_path.keys(): 1251 if not self._policy.path_return_ok(path, request): 1252 continue 1253 cookies_by_name = cookies_by_path[path] 1254 for cookie in cookies_by_name.values(): 1255 if not self._policy.return_ok(cookie, request): 1256 _debug(" not returning cookie") 1257 continue 1258 _debug(" it's a match") 1259 cookies.append(cookie) 1260 return cookies 1261 1262 def _cookies_for_request(self, request): 1263 """Return a list of cookies to be returned to server.""" 1264 cookies = [] 1265 for domain in self._cookies.keys(): 1266 cookies.extend(self._cookies_for_domain(domain, request)) 1267 return cookies 1268 1269 def _cookie_attrs(self, cookies): 1270 """Return a list of cookie-attributes to be returned to server. 1271 1272 like ['foo="bar"; $Path="/"', ...] 1273 1274 The $Version attribute is also added when appropriate (currently only 1275 once per request). 1276 1277 """ 1278 # add cookies in order of most specific (ie. 
longest) path first 1279 cookies.sort(key=lambda arg: len(arg.path), reverse=True) 1280 1281 version_set = False 1282 1283 attrs = [] 1284 for cookie in cookies: 1285 # set version of Cookie header 1286 # XXX 1287 # What should it be if multiple matching Set-Cookie headers have 1288 # different versions themselves? 1289 # Answer: there is no answer; was supposed to be settled by 1290 # RFC 2965 errata, but that may never appear... 1291 version = cookie.version 1292 if not version_set: 1293 version_set = True 1294 if version > 0: 1295 attrs.append("$Version=%s" % version) 1296 1297 # quote cookie value if necessary 1298 # (not for Netscape protocol, which already has any quotes 1299 # intact, due to the poorly-specified Netscape Cookie: syntax) 1300 if ((cookie.value is not None) and 1301 self.non_word_re.search(cookie.value) and version > 0): 1302 value = self.quote_re.sub(r"\\\1", cookie.value) 1303 else: 1304 value = cookie.value 1305 1306 # add cookie-attributes to be returned in Cookie header 1307 if cookie.value is None: 1308 attrs.append(cookie.name) 1309 else: 1310 attrs.append("%s=%s" % (cookie.name, value)) 1311 if version > 0: 1312 if cookie.path_specified: 1313 attrs.append('$Path="%s"' % cookie.path) 1314 if cookie.domain.startswith("."): 1315 domain = cookie.domain 1316 if (not cookie.domain_initial_dot and 1317 domain.startswith(".")): 1318 domain = domain[1:] 1319 attrs.append('$Domain="%s"' % domain) 1320 if cookie.port is not None: 1321 p = "$Port" 1322 if cookie.port_specified: 1323 p = p + ('="%s"' % cookie.port) 1324 attrs.append(p) 1325 1326 return attrs 1327 1328 def add_cookie_header(self, request): 1329 """Add correct Cookie: header to request (urllib2.Request object). 1330 1331 The Cookie2 header is also added unless policy.hide_cookie2 is true. 
1332 1333 """ 1334 _debug("add_cookie_header") 1335 self._cookies_lock.acquire() 1336 try: 1337 1338 self._policy._now = self._now = int(time.time()) 1339 1340 cookies = self._cookies_for_request(request) 1341 1342 attrs = self._cookie_attrs(cookies) 1343 if attrs: 1344 if not request.has_header("Cookie"): 1345 request.add_unredirected_header( 1346 "Cookie", "; ".join(attrs)) 1347 1348 # if necessary, advertise that we know RFC 2965 1349 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and 1350 not request.has_header("Cookie2")): 1351 for cookie in cookies: 1352 if cookie.version != 1: 1353 request.add_unredirected_header("Cookie2", '$Version="1"') 1354 break 1355 1356 finally: 1357 self._cookies_lock.release() 1358 1359 self.clear_expired_cookies() 1360 1361 def _normalized_cookie_tuples(self, attrs_set): 1362 """Return list of tuples containing normalised cookie information. 1363 1364 attrs_set is the list of lists of key,value pairs extracted from 1365 the Set-Cookie or Set-Cookie2 headers. 1366 1367 Tuples are name, value, standard, rest, where name and value are the 1368 cookie name and value, standard is a dictionary containing the standard 1369 cookie-attributes (discard, secure, version, expires or max-age, 1370 domain, path and port) and rest is a dictionary containing the rest of 1371 the cookie-attributes. 1372 1373 """ 1374 cookie_tuples = [] 1375 1376 boolean_attrs = "discard", "secure" 1377 value_attrs = ("version", 1378 "expires", "max-age", 1379 "domain", "path", "port", 1380 "comment", "commenturl") 1381 1382 for cookie_attrs in attrs_set: 1383 name, value = cookie_attrs[0] 1384 1385 # Build dictionary of standard cookie-attributes (standard) and 1386 # dictionary of other cookie-attributes (rest). 1387 1388 # Note: expiry time is normalised to seconds since epoch. 
V0 1389 # cookies should have the Expires cookie-attribute, and V1 cookies 1390 # should have Max-Age, but since V1 includes RFC 2109 cookies (and 1391 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we 1392 # accept either (but prefer Max-Age). 1393 max_age_set = False 1394 1395 bad_cookie = False 1396 1397 standard = {} 1398 rest = {} 1399 for k, v in cookie_attrs[1:]: 1400 lc = k.lower() 1401 # don't lose case distinction for unknown fields 1402 if lc in value_attrs or lc in boolean_attrs: 1403 k = lc 1404 if k in boolean_attrs and v is None: 1405 # boolean cookie-attribute is present, but has no value 1406 # (like "discard", rather than "port=80") 1407 v = True 1408 if k in standard: 1409 # only first value is significant 1410 continue 1411 if k == "domain": 1412 if v is None: 1413 _debug(" missing value for domain attribute") 1414 bad_cookie = True 1415 break 1416 # RFC 2965 section 3.3.3 1417 v = v.lower() 1418 if k == "expires": 1419 if max_age_set: 1420 # Prefer max-age to expires (like Mozilla) 1421 continue 1422 if v is None: 1423 _debug(" missing or invalid value for expires " 1424 "attribute: treating as session cookie") 1425 continue 1426 if k == "max-age": 1427 max_age_set = True 1428 try: 1429 v = int(v) 1430 except ValueError: 1431 _debug(" missing or invalid (non-numeric) value for " 1432 "max-age attribute") 1433 bad_cookie = True 1434 break 1435 # convert RFC 2965 Max-Age to seconds since epoch 1436 # XXX Strictly you're supposed to follow RFC 2616 1437 # age-calculation rules. Remember that zero Max-Age 1438 # is a request to discard (old and new) cookie, though. 
1439 k = "expires" 1440 v = self._now + v 1441 if (k in value_attrs) or (k in boolean_attrs): 1442 if (v is None and 1443 k not in ("port", "comment", "commenturl")): 1444 _debug(" missing value for %s attribute" % k) 1445 bad_cookie = True 1446 break 1447 standard[k] = v 1448 else: 1449 rest[k] = v 1450 1451 if bad_cookie: 1452 continue 1453 1454 cookie_tuples.append((name, value, standard, rest)) 1455 1456 return cookie_tuples 1457 1458 def _cookie_from_cookie_tuple(self, tup, request): 1459 # standard is dict of standard cookie-attributes, rest is dict of the 1460 # rest of them 1461 name, value, standard, rest = tup 1462 1463 domain = standard.get("domain", Absent) 1464 path = standard.get("path", Absent) 1465 port = standard.get("port", Absent) 1466 expires = standard.get("expires", Absent) 1467 1468 # set the easy defaults 1469 version = standard.get("version", None) 1470 if version is not None: 1471 try: 1472 version = int(version) 1473 except ValueError: 1474 return None # invalid version, ignore cookie 1475 secure = standard.get("secure", False) 1476 # (discard is also set if expires is Absent) 1477 discard = standard.get("discard", False) 1478 comment = standard.get("comment", None) 1479 comment_url = standard.get("commenturl", None) 1480 1481 # set default path 1482 if path is not Absent and path != "": 1483 path_specified = True 1484 path = escape_path(path) 1485 else: 1486 path_specified = False 1487 path = request_path(request) 1488 i = path.rfind("/") 1489 if i != -1: 1490 if version == 0: 1491 # Netscape spec parts company from reality here 1492 path = path[:i] 1493 else: 1494 path = path[:i+1] 1495 if len(path) == 0: path = "/" 1496 1497 # set default domain 1498 domain_specified = domain is not Absent 1499 # but first we have to remember whether it starts with a dot 1500 domain_initial_dot = False 1501 if domain_specified: 1502 domain_initial_dot = bool(domain.startswith(".")) 1503 if domain is Absent: 1504 req_host, erhn = 
eff_request_host(request) 1505 domain = erhn 1506 elif not domain.startswith("."): 1507 domain = "."+domain 1508 1509 # set default port 1510 port_specified = False 1511 if port is not Absent: 1512 if port is None: 1513 # Port attr present, but has no value: default to request port. 1514 # Cookie should then only be sent back on that port. 1515 port = request_port(request) 1516 else: 1517 port_specified = True 1518 port = re.sub(r"\s+", "", port) 1519 else: 1520 # No port attr present. Cookie can be sent back on any port. 1521 port = None 1522 1523 # set default expires and discard 1524 if expires is Absent: 1525 expires = None 1526 discard = True 1527 elif expires <= self._now: 1528 # Expiry date in past is request to delete cookie. This can't be 1529 # in DefaultCookiePolicy, because can't delete cookies there. 1530 try: 1531 self.clear(domain, path, name) 1532 except KeyError: 1533 pass 1534 _debug("Expiring cookie, domain='%s', path='%s', name='%s'", 1535 domain, path, name) 1536 return None 1537 1538 return Cookie(version, 1539 name, value, 1540 port, port_specified, 1541 domain, domain_specified, domain_initial_dot, 1542 path, path_specified, 1543 secure, 1544 expires, 1545 discard, 1546 comment, 1547 comment_url, 1548 rest) 1549 1550 def _cookies_from_attrs_set(self, attrs_set, request): 1551 cookie_tuples = self._normalized_cookie_tuples(attrs_set) 1552 1553 cookies = [] 1554 for tup in cookie_tuples: 1555 cookie = self._cookie_from_cookie_tuple(tup, request) 1556 if cookie: cookies.append(cookie) 1557 return cookies 1558 1559 def _process_rfc2109_cookies(self, cookies): 1560 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None) 1561 if rfc2109_as_ns is None: 1562 rfc2109_as_ns = not self._policy.rfc2965 1563 for cookie in cookies: 1564 if cookie.version == 1: 1565 cookie.rfc2109 = True 1566 if rfc2109_as_ns: 1567 # treat 2109 cookies as Netscape cookies rather than 1568 # as RFC2965 cookies 1569 cookie.version = 0 1570 1571 def 
make_cookies(self, response, request): 1572 """Return sequence of Cookie objects extracted from response object.""" 1573 # get cookie-attributes for RFC 2965 and Netscape protocols 1574 headers = response.info() 1575 rfc2965_hdrs = headers.getheaders("Set-Cookie2") 1576 ns_hdrs = headers.getheaders("Set-Cookie") 1577 1578 rfc2965 = self._policy.rfc2965 1579 netscape = self._policy.netscape 1580 1581 if ((not rfc2965_hdrs and not ns_hdrs) or 1582 (not ns_hdrs and not rfc2965) or 1583 (not rfc2965_hdrs and not netscape) or 1584 (not netscape and not rfc2965)): 1585 return [] # no relevant cookie headers: quick exit 1586 1587 try: 1588 cookies = self._cookies_from_attrs_set( 1589 split_header_words(rfc2965_hdrs), request) 1590 except Exception: 1591 _warn_unhandled_exception() 1592 cookies = [] 1593 1594 if ns_hdrs and netscape: 1595 try: 1596 # RFC 2109 and Netscape cookies 1597 ns_cookies = self._cookies_from_attrs_set( 1598 parse_ns_headers(ns_hdrs), request) 1599 except Exception: 1600 _warn_unhandled_exception() 1601 ns_cookies = [] 1602 self._process_rfc2109_cookies(ns_cookies) 1603 1604 # Look for Netscape cookies (from Set-Cookie headers) that match 1605 # corresponding RFC 2965 cookies (from Set-Cookie2 headers). 1606 # For each match, keep the RFC 2965 cookie and ignore the Netscape 1607 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are 1608 # bundled in with the Netscape cookies for this purpose, which is 1609 # reasonable behaviour. 
1610 if rfc2965: 1611 lookup = {} 1612 for cookie in cookies: 1613 lookup[(cookie.domain, cookie.path, cookie.name)] = None 1614 1615 def no_matching_rfc2965(ns_cookie, lookup=lookup): 1616 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name 1617 return key not in lookup 1618 ns_cookies = filter(no_matching_rfc2965, ns_cookies) 1619 1620 if ns_cookies: 1621 cookies.extend(ns_cookies) 1622 1623 return cookies 1624 1625 def set_cookie_if_ok(self, cookie, request): 1626 """Set a cookie if policy says it's OK to do so.""" 1627 self._cookies_lock.acquire() 1628 try: 1629 self._policy._now = self._now = int(time.time()) 1630 1631 if self._policy.set_ok(cookie, request): 1632 self.set_cookie(cookie) 1633 1634 1635 finally: 1636 self._cookies_lock.release() 1637 1638 def set_cookie(self, cookie): 1639 """Set a cookie, without checking whether or not it should be set.""" 1640 c = self._cookies 1641 self._cookies_lock.acquire() 1642 try: 1643 if cookie.domain not in c: c[cookie.domain] = {} 1644 c2 = c[cookie.domain] 1645 if cookie.path not in c2: c2[cookie.path] = {} 1646 c3 = c2[cookie.path] 1647 c3[cookie.name] = cookie 1648 finally: 1649 self._cookies_lock.release() 1650 1651 def extract_cookies(self, response, request): 1652 """Extract cookies from response, where allowable given the request.""" 1653 _debug("extract_cookies: %s", response.info()) 1654 self._cookies_lock.acquire() 1655 try: 1656 self._policy._now = self._now = int(time.time()) 1657 1658 for cookie in self.make_cookies(response, request): 1659 if self._policy.set_ok(cookie, request): 1660 _debug(" setting cookie: %s", cookie) 1661 self.set_cookie(cookie) 1662 finally: 1663 self._cookies_lock.release() 1664 1665 def clear(self, domain=None, path=None, name=None): 1666 """Clear some cookies. 1667 1668 Invoking this method without arguments will clear all cookies. If 1669 given a single argument, only cookies belonging to that domain will be 1670 removed. 
If given two arguments, cookies belonging to the specified 1671 path within that domain are removed. If given three arguments, then 1672 the cookie with the specified name, path and domain is removed. 1673 1674 Raises KeyError if no matching cookie exists. 1675 1676 """ 1677 if name is not None: 1678 if (domain is None) or (path is None): 1679 raise ValueError( 1680 "domain and path must be given to remove a cookie by name") 1681 del self._cookies[domain][path][name] 1682 elif path is not None: 1683 if domain is None: 1684 raise ValueError( 1685 "domain must be given to remove cookies by path") 1686 del self._cookies[domain][path] 1687 elif domain is not None: 1688 del self._cookies[domain] 1689 else: 1690 self._cookies = {} 1691 1692 def clear_session_cookies(self): 1693 """Discard all session cookies. 1694 1695 Note that the .save() method won't save session cookies anyway, unless 1696 you ask otherwise by passing a true ignore_discard argument. 1697 1698 """ 1699 self._cookies_lock.acquire() 1700 try: 1701 for cookie in self: 1702 if cookie.discard: 1703 self.clear(cookie.domain, cookie.path, cookie.name) 1704 finally: 1705 self._cookies_lock.release() 1706 1707 def clear_expired_cookies(self): 1708 """Discard all expired cookies. 1709 1710 You probably don't need to call this method: expired cookies are never 1711 sent back to the server (provided you're using DefaultCookiePolicy), 1712 this method is called by CookieJar itself every so often, and the 1713 .save() method won't save expired cookies anyway (unless you ask 1714 otherwise by passing a true ignore_expires argument). 
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class resolves the file name and opens/closes the file; parsing is
    delegated to self._really_load(), and .save() raises NotImplementedError
    here.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, if given, must support concatenation with a string;
        otherwise ValueError is raised.  delayload is stored as a bool.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except Exception:
                # Not a bare except: KeyboardInterrupt and SystemExit must
                # not be reported as a bad filename.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  The opened file is always closed again, whether or not
        self._really_load() succeeds.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.  The previous
        cookies are likewise restored if loading fails with any other
        exception, which is then re-raised.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:

            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except Exception:
                # Roll back on every failure, not only I/O errors, so a
                # parse error cannot leave the jar empty or half-loaded.
                self._cookies = old_state
                raise

        finally:
            self._cookies_lock.release()