1r"""HTTP cookie handling for web clients. 2 3This module has (now fairly distant) origins in Gisle Aas' Perl module 4HTTP::Cookies, from the libwww-perl library. 5 6Docstrings, comments and debug strings in this code refer to the 7attributes of the HTTP cookie system as cookie-attributes, to distinguish 8them clearly from Python attributes. 9 10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not 11distributed with the Python standard library, but are available from 12http://wwwsearch.sf.net/): 13 14 CookieJar____ 15 / \ \ 16 FileCookieJar \ \ 17 / | \ \ \ 18 MozillaCookieJar | LWPCookieJar \ \ 19 | | \ 20 | ---MSIEBase | \ 21 | / | | \ 22 | / MSIEDBCookieJar BSDDBCookieJar 23 |/ 24 MSIECookieJar 25 26""" 27 28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', 29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] 30 31import copy 32import datetime 33import re 34import time 35import urllib.parse, urllib.request 36import threading as _threading 37import http.client # only for the default HTTP port 38from calendar import timegm 39 40debug = False # set to True to enable debugging via the logging module 41logger = None 42 43def _debug(*args): 44 if not debug: 45 return 46 global logger 47 if not logger: 48 import logging 49 logger = logging.getLogger("http.cookiejar") 50 return logger.debug(*args) 51 52 53DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT) 54MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " 55 "instance initialised with one)") 56 57def _warn_unhandled_exception(): 58 # There are a few catch-all except: statements in this module, for 59 # catching input that's bad in unexpected ways. Warn if any 60 # exceptions are caught there. 
61 import io, warnings, traceback 62 f = io.StringIO() 63 traceback.print_exc(None, f) 64 msg = f.getvalue() 65 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2) 66 67 68# Date/time conversion 69# ----------------------------------------------------------------------------- 70 71EPOCH_YEAR = 1970 72def _timegm(tt): 73 year, month, mday, hour, min, sec = tt[:6] 74 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and 75 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): 76 return timegm(tt) 77 else: 78 return None 79 80DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 81MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 82 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 83MONTHS_LOWER = [] 84for month in MONTHS: MONTHS_LOWER.append(month.lower()) 85 86def time2isoz(t=None): 87 """Return a string representing time in seconds since epoch, t. 88 89 If the function is called without an argument, it will use the current 90 time. 91 92 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", 93 representing Universal Time (UTC, aka GMT). An example of this format is: 94 95 1994-11-24 08:49:37Z 96 97 """ 98 if t is None: 99 dt = datetime.datetime.utcnow() 100 else: 101 dt = datetime.datetime.utcfromtimestamp(t) 102 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( 103 dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) 104 105def time2netscape(t=None): 106 """Return a string representing time in seconds since epoch, t. 107 108 If the function is called without an argument, it will use the current 109 time. 
110 111 The format of the returned string is like this: 112 113 Wed, DD-Mon-YYYY HH:MM:SS GMT 114 115 """ 116 if t is None: 117 dt = datetime.datetime.utcnow() 118 else: 119 dt = datetime.datetime.utcfromtimestamp(t) 120 return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % ( 121 DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1], 122 dt.year, dt.hour, dt.minute, dt.second) 123 124 125UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} 126 127TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII) 128def offset_from_tz_string(tz): 129 offset = None 130 if tz in UTC_ZONES: 131 offset = 0 132 else: 133 m = TIMEZONE_RE.search(tz) 134 if m: 135 offset = 3600 * int(m.group(2)) 136 if m.group(3): 137 offset = offset + 60 * int(m.group(3)) 138 if m.group(1) == '-': 139 offset = -offset 140 return offset 141 142def _str2time(day, mon, yr, hr, min, sec, tz): 143 yr = int(yr) 144 if yr > datetime.MAXYEAR: 145 return None 146 147 # translate month name to number 148 # month numbers start with 1 (January) 149 try: 150 mon = MONTHS_LOWER.index(mon.lower())+1 151 except ValueError: 152 # maybe it's already a number 153 try: 154 imon = int(mon) 155 except ValueError: 156 return None 157 if 1 <= imon <= 12: 158 mon = imon 159 else: 160 return None 161 162 # make sure clock elements are defined 163 if hr is None: hr = 0 164 if min is None: min = 0 165 if sec is None: sec = 0 166 167 day = int(day) 168 hr = int(hr) 169 min = int(min) 170 sec = int(sec) 171 172 if yr < 1000: 173 # find "obvious" year 174 cur_yr = time.localtime(time.time())[0] 175 m = cur_yr % 100 176 tmp = yr 177 yr = yr + cur_yr - m 178 m = m - tmp 179 if abs(m) > 50: 180 if m > 0: yr = yr + 100 181 else: yr = yr - 100 182 183 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) 184 t = _timegm((yr, mon, day, hr, min, sec, tz)) 185 186 if t is not None: 187 # adjust time using timezone string, to get absolute time since epoch 188 if tz is None: 189 tz = "UTC" 190 tz = tz.upper() 191 
offset = offset_from_tz_string(tz) 192 if offset is None: 193 return None 194 t = t - offset 195 196 return t 197 198STRICT_DATE_RE = re.compile( 199 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " 200 r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII) 201WEEKDAY_RE = re.compile( 202 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII) 203LOOSE_HTTP_DATE_RE = re.compile( 204 r"""^ 205 (\d\d?) # day 206 (?:\s+|[-\/]) 207 (\w+) # month 208 (?:\s+|[-\/]) 209 (\d+) # year 210 (?: 211 (?:\s+|:) # separator before clock 212 (\d\d?):(\d\d) # hour:min 213 (?::(\d\d))? # optional seconds 214 )? # optional clock 215 \s* 216 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone 217 \s* 218 (?:\(\w+\))? # ASCII representation of timezone in parens. 219 \s*$""", re.X | re.ASCII) 220def http2time(text): 221 """Returns time in seconds since epoch of time represented by a string. 222 223 Return value is an integer. 224 225 None is returned if the format of str is unrecognized, the time is outside 226 the representable range, or the timezone string is not recognized. If the 227 string contains no timezone, UTC is assumed. 228 229 The timezone in the string may be numerical (like "-0800" or "+0100") or a 230 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the 231 timezone strings equivalent to UTC (zero offset) are known to the function. 232 233 The function loosely parses the following formats: 234 235 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format 236 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format 237 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format 238 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) 239 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) 240 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) 241 242 The parser ignores leading and trailing whitespace. The time may be 243 absent. 
244 245 If the year is given with only 2 digits, the function will select the 246 century that makes the year closest to the current date. 247 248 """ 249 # fast exit for strictly conforming string 250 m = STRICT_DATE_RE.search(text) 251 if m: 252 g = m.groups() 253 mon = MONTHS_LOWER.index(g[1].lower()) + 1 254 tt = (int(g[2]), mon, int(g[0]), 255 int(g[3]), int(g[4]), float(g[5])) 256 return _timegm(tt) 257 258 # No, we need some messy parsing... 259 260 # clean up 261 text = text.lstrip() 262 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday 263 264 # tz is time zone specifier string 265 day, mon, yr, hr, min, sec, tz = [None]*7 266 267 # loose regexp parse 268 m = LOOSE_HTTP_DATE_RE.search(text) 269 if m is not None: 270 day, mon, yr, hr, min, sec, tz = m.groups() 271 else: 272 return None # bad format 273 274 return _str2time(day, mon, yr, hr, min, sec, tz) 275 276ISO_DATE_RE = re.compile( 277 r"""^ 278 (\d{4}) # year 279 [-\/]? 280 (\d\d?) # numerical month 281 [-\/]? 282 (\d\d?) # day 283 (?: 284 (?:\s+|[-:Tt]) # separator before clock 285 (\d\d?):?(\d\d) # hour:min 286 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) 287 )? # optional clock 288 \s* 289 ([-+]?\d\d?:?(:?\d\d)? 290 |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) 291 \s*$""", re.X | re. ASCII) 292def iso2time(text): 293 """ 294 As for http2time, but parses the ISO 8601 formats: 295 296 1994-02-03 14:15:29 -0100 -- ISO 8601 format 297 1994-02-03 14:15:29 -- zone is optional 298 1994-02-03 -- only date 299 1994-02-03T14:15:29 -- Use T as separator 300 19940203T141529Z -- ISO 8601 compact format 301 19940203 -- only date 302 303 """ 304 # clean up 305 text = text.lstrip() 306 307 # tz is time zone specifier string 308 day, mon, yr, hr, min, sec, tz = [None]*7 309 310 # loose regexp parse 311 m = ISO_DATE_RE.search(text) 312 if m is not None: 313 # XXX there's an extra bit of the timezone I'm ignoring here: is 314 # this the right thing to do? 
315 yr, mon, day, hr, min, sec, tz, _ = m.groups() 316 else: 317 return None # bad format 318 319 return _str2time(day, mon, yr, hr, min, sec, tz) 320 321 322# Header parsing 323# ----------------------------------------------------------------------------- 324 325def unmatched(match): 326 """Return unmatched part of re.Match object.""" 327 start, end = match.span(0) 328 return match.string[:start]+match.string[end:] 329 330HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") 331HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") 332HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") 333HEADER_ESCAPE_RE = re.compile(r"\\(.)") 334def split_header_words(header_values): 335 r"""Parse header values into a list of lists containing key,value pairs. 336 337 The function knows how to deal with ",", ";" and "=" as well as quoted 338 values after "=". A list of space separated tokens are parsed as if they 339 were separated by ";". 340 341 If the header_values passed as argument contains multiple values, then they 342 are treated as if they were a single value separated by comma ",". 343 344 This means that this function is useful for parsing header fields that 345 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax 346 the requirement for tokens). 347 348 headers = #header 349 header = (token | parameter) *( [";"] (token | parameter)) 350 351 token = 1*<any CHAR except CTLs or separators> 352 separators = "(" | ")" | "<" | ">" | "@" 353 | "," | ";" | ":" | "\" | <"> 354 | "/" | "[" | "]" | "?" | "=" 355 | "{" | "}" | SP | HT 356 357 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) 358 qdtext = <any TEXT except <">> 359 quoted-pair = "\" CHAR 360 361 parameter = attribute "=" value 362 attribute = token 363 value = token | quoted-string 364 365 Each header is represented by a list of key/value pairs. The value for a 366 simple token (not part of a parameter) is None. 
Syntactically incorrect 367 headers will not necessarily be parsed as you would want. 368 369 This is easier to describe with some examples: 370 371 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) 372 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] 373 >>> split_header_words(['text/html; charset="iso-8859-1"']) 374 [[('text/html', None), ('charset', 'iso-8859-1')]] 375 >>> split_header_words([r'Basic realm="\"foo\bar\""']) 376 [[('Basic', None), ('realm', '"foobar"')]] 377 378 """ 379 assert not isinstance(header_values, str) 380 result = [] 381 for text in header_values: 382 orig_text = text 383 pairs = [] 384 while text: 385 m = HEADER_TOKEN_RE.search(text) 386 if m: 387 text = unmatched(m) 388 name = m.group(1) 389 m = HEADER_QUOTED_VALUE_RE.search(text) 390 if m: # quoted value 391 text = unmatched(m) 392 value = m.group(1) 393 value = HEADER_ESCAPE_RE.sub(r"\1", value) 394 else: 395 m = HEADER_VALUE_RE.search(text) 396 if m: # unquoted value 397 text = unmatched(m) 398 value = m.group(1) 399 value = value.rstrip() 400 else: 401 # no value, a lone token 402 value = None 403 pairs.append((name, value)) 404 elif text.lstrip().startswith(","): 405 # concatenated headers, as per RFC 2616 section 4.2 406 text = text.lstrip()[1:] 407 if pairs: result.append(pairs) 408 pairs = [] 409 else: 410 # skip junk 411 non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text) 412 assert nr_junk_chars > 0, ( 413 "split_header_words bug: '%s', '%s', %s" % 414 (orig_text, text, pairs)) 415 text = non_junk 416 if pairs: result.append(pairs) 417 return result 418 419HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") 420def join_header_words(lists): 421 """Do the inverse (almost) of the conversion done by split_header_words. 422 423 Takes a list of lists of (key, value) pairs and produces a single header 424 value. Attribute values are quoted if needed. 
425 426 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]]) 427 'text/plain; charset="iso-8859-1"' 428 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]]) 429 'text/plain, charset="iso-8859-1"' 430 431 """ 432 headers = [] 433 for pairs in lists: 434 attr = [] 435 for k, v in pairs: 436 if v is not None: 437 if not re.search(r"^\w+$", v): 438 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ 439 v = '"%s"' % v 440 k = "%s=%s" % (k, v) 441 attr.append(k) 442 if attr: headers.append("; ".join(attr)) 443 return ", ".join(headers) 444 445def strip_quotes(text): 446 if text.startswith('"'): 447 text = text[1:] 448 if text.endswith('"'): 449 text = text[:-1] 450 return text 451 452def parse_ns_headers(ns_headers): 453 """Ad-hoc parser for Netscape protocol cookie-attributes. 454 455 The old Netscape cookie format for Set-Cookie can for instance contain 456 an unquoted "," in the expires field, so we have to use this ad-hoc 457 parser instead of split_header_words. 458 459 XXX This may not make the best possible effort to parse all the crap 460 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient 461 parser is probably better, so could do worse than following that if 462 this ever gives any trouble. 463 464 Currently, this is also used for parsing RFC 2109 cookies. 465 466 """ 467 known_attrs = ("expires", "domain", "path", "secure", 468 # RFC 2109 attrs (may turn up in Netscape cookies, too) 469 "version", "port", "max-age") 470 471 result = [] 472 for ns_header in ns_headers: 473 pairs = [] 474 version_set = False 475 476 # XXX: The following does not strictly adhere to RFCs in that empty 477 # names and values are legal (the former will only appear once and will 478 # be overwritten if multiple occurrences are present). This is 479 # mostly to deal with backwards compatibility. 
480 for ii, param in enumerate(ns_header.split(';')): 481 param = param.strip() 482 483 key, sep, val = param.partition('=') 484 key = key.strip() 485 486 if not key: 487 if ii == 0: 488 break 489 else: 490 continue 491 492 # allow for a distinction between present and empty and missing 493 # altogether 494 val = val.strip() if sep else None 495 496 if ii != 0: 497 lc = key.lower() 498 if lc in known_attrs: 499 key = lc 500 501 if key == "version": 502 # This is an RFC 2109 cookie. 503 if val is not None: 504 val = strip_quotes(val) 505 version_set = True 506 elif key == "expires": 507 # convert expires date to seconds since epoch 508 if val is not None: 509 val = http2time(strip_quotes(val)) # None if invalid 510 pairs.append((key, val)) 511 512 if pairs: 513 if not version_set: 514 pairs.append(("version", "0")) 515 result.append(pairs) 516 517 return result 518 519 520IPV4_RE = re.compile(r"\.\d+$", re.ASCII) 521def is_HDN(text): 522 """Return True if text is a host domain name.""" 523 # XXX 524 # This may well be wrong. Which RFC is HDN defined in, if any (for 525 # the purposes of RFC 2965)? 526 # For the current implementation, what about IPv6? Remember to look 527 # at other uses of IPV4_RE also, if change this. 528 if IPV4_RE.search(text): 529 return False 530 if text == "": 531 return False 532 if text[0] == "." or text[-1] == ".": 533 return False 534 return True 535 536def domain_match(A, B): 537 """Return True if domain A domain-matches domain B, according to RFC 2965. 538 539 A and B may be host domain names or IP addresses. 540 541 RFC 2965, section 1: 542 543 Host names can be specified either as an IP address or a HDN string. 544 Sometimes we compare one host name with another. (Such comparisons SHALL 545 be case-insensitive.) 
Host A's name domain-matches host B's if 546 547 * their host name strings string-compare equal; or 548 549 * A is a HDN string and has the form NB, where N is a non-empty 550 name string, B has the form .B', and B' is a HDN string. (So, 551 x.y.com domain-matches .Y.com but not Y.com.) 552 553 Note that domain-match is not a commutative operation: a.b.c.com 554 domain-matches .c.com, but not the reverse. 555 556 """ 557 # Note that, if A or B are IP addresses, the only relevant part of the 558 # definition of the domain-match algorithm is the direct string-compare. 559 A = A.lower() 560 B = B.lower() 561 if A == B: 562 return True 563 if not is_HDN(A): 564 return False 565 i = A.rfind(B) 566 if i == -1 or i == 0: 567 # A does not have form NB, or N is the empty string 568 return False 569 if not B.startswith("."): 570 return False 571 if not is_HDN(B[1:]): 572 return False 573 return True 574 575def liberal_is_HDN(text): 576 """Return True if text is a sort-of-like a host domain name. 577 578 For accepting/blocking domains. 579 580 """ 581 if IPV4_RE.search(text): 582 return False 583 return True 584 585def user_domain_match(A, B): 586 """For blocking/accepting domains. 587 588 A and B may be host domain names or IP addresses. 589 590 """ 591 A = A.lower() 592 B = B.lower() 593 if not (liberal_is_HDN(A) and liberal_is_HDN(B)): 594 if A == B: 595 # equal IP addresses 596 return True 597 return False 598 initial_dot = B.startswith(".") 599 if initial_dot and A.endswith(B): 600 return True 601 if not initial_dot and A == B: 602 return True 603 return False 604 605cut_port_re = re.compile(r":\d+$", re.ASCII) 606def request_host(request): 607 """Return request-host, as defined by RFC 2965. 608 609 Variation from RFC: returned value is lowercased, for convenient 610 comparison. 
611 612 """ 613 url = request.get_full_url() 614 host = urllib.parse.urlparse(url)[1] 615 if host == "": 616 host = request.get_header("Host", "") 617 618 # remove port, if present 619 host = cut_port_re.sub("", host, 1) 620 return host.lower() 621 622def eff_request_host(request): 623 """Return a tuple (request-host, effective request-host name). 624 625 As defined by RFC 2965, except both are lowercased. 626 627 """ 628 erhn = req_host = request_host(request) 629 if req_host.find(".") == -1 and not IPV4_RE.search(req_host): 630 erhn = req_host + ".local" 631 return req_host, erhn 632 633def request_path(request): 634 """Path component of request-URI, as defined by RFC 2965.""" 635 url = request.get_full_url() 636 parts = urllib.parse.urlsplit(url) 637 path = escape_path(parts.path) 638 if not path.startswith("/"): 639 # fix bad RFC 2396 absoluteURI 640 path = "/" + path 641 return path 642 643def request_port(request): 644 host = request.host 645 i = host.find(':') 646 if i >= 0: 647 port = host[i+1:] 648 try: 649 int(port) 650 except ValueError: 651 _debug("nonnumeric port: '%s'", port) 652 return None 653 else: 654 port = DEFAULT_HTTP_PORT 655 return port 656 657# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't 658# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 
659HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" 660ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") 661def uppercase_escaped_char(match): 662 return "%%%s" % match.group(1).upper() 663def escape_path(path): 664 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" 665 # There's no knowing what character encoding was used to create URLs 666 # containing %-escapes, but since we have to pick one to escape invalid 667 # path characters, we pick UTF-8, as recommended in the HTML 4.0 668 # specification: 669 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 670 # And here, kind of: draft-fielding-uri-rfc2396bis-03 671 # (And in draft IRI specification: draft-duerst-iri-05) 672 # (And here, for new URI schemes: RFC 2718) 673 path = urllib.parse.quote(path, HTTP_PATH_SAFE) 674 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) 675 return path 676 677def reach(h): 678 """Return reach of host h, as defined by RFC 2965, section 1. 679 680 The reach R of a host name H is defined as follows: 681 682 * If 683 684 - H is the host domain name of a host; and, 685 686 - H has the form A.B; and 687 688 - A has no embedded (that is, interior) dots; and 689 690 - B has at least one embedded dot, or B is the string "local". 691 then the reach of H is .B. 692 693 * Otherwise, the reach of H is H. 694 695 >>> reach("www.acme.com") 696 '.acme.com' 697 >>> reach("acme.com") 698 'acme.com' 699 >>> reach("acme.local") 700 '.local' 701 702 """ 703 i = h.find(".") 704 if i >= 0: 705 #a = h[:i] # this line is only here to show what a is 706 b = h[i+1:] 707 i = b.find(".") 708 if is_HDN(h) and (i >= 0 or b == "local"): 709 return "."+b 710 return h 711 712def is_third_party(request): 713 """ 714 715 RFC 2965, section 3.3.6: 716 717 An unverifiable transaction is to a third-party host if its request- 718 host U does not domain-match the reach R of the request-host O in the 719 origin transaction. 
720 721 """ 722 req_host = request_host(request) 723 if not domain_match(req_host, reach(request.origin_req_host)): 724 return True 725 else: 726 return False 727 728 729class Cookie: 730 """HTTP Cookie. 731 732 This class represents both Netscape and RFC 2965 cookies. 733 734 This is deliberately a very simple class. It just holds attributes. It's 735 possible to construct Cookie instances that don't comply with the cookie 736 standards. CookieJar.make_cookies is the factory function for Cookie 737 objects -- it deals with cookie parsing, supplying defaults, and 738 normalising to the representation used in this class. CookiePolicy is 739 responsible for checking them to see whether they should be accepted from 740 and returned to the server. 741 742 Note that the port may be present in the headers, but unspecified ("Port" 743 rather than"Port=80", for example); if this is the case, port is None. 744 745 """ 746 747 def __init__(self, version, name, value, 748 port, port_specified, 749 domain, domain_specified, domain_initial_dot, 750 path, path_specified, 751 secure, 752 expires, 753 discard, 754 comment, 755 comment_url, 756 rest, 757 rfc2109=False, 758 ): 759 760 if version is not None: version = int(version) 761 if expires is not None: expires = int(float(expires)) 762 if port is None and port_specified is True: 763 raise ValueError("if port is None, port_specified must be false") 764 765 self.version = version 766 self.name = name 767 self.value = value 768 self.port = port 769 self.port_specified = port_specified 770 # normalise case, as per RFC 2965 section 3.3.3 771 self.domain = domain.lower() 772 self.domain_specified = domain_specified 773 # Sigh. We need to know whether the domain given in the 774 # cookie-attribute had an initial dot, in order to follow RFC 2965 775 # (as clarified in draft errata). Needed for the returned $Domain 776 # value. 
777 self.domain_initial_dot = domain_initial_dot 778 self.path = path 779 self.path_specified = path_specified 780 self.secure = secure 781 self.expires = expires 782 self.discard = discard 783 self.comment = comment 784 self.comment_url = comment_url 785 self.rfc2109 = rfc2109 786 787 self._rest = copy.copy(rest) 788 789 def has_nonstandard_attr(self, name): 790 return name in self._rest 791 def get_nonstandard_attr(self, name, default=None): 792 return self._rest.get(name, default) 793 def set_nonstandard_attr(self, name, value): 794 self._rest[name] = value 795 796 def is_expired(self, now=None): 797 if now is None: now = time.time() 798 if (self.expires is not None) and (self.expires <= now): 799 return True 800 return False 801 802 def __str__(self): 803 if self.port is None: p = "" 804 else: p = ":"+self.port 805 limit = self.domain + p + self.path 806 if self.value is not None: 807 namevalue = "%s=%s" % (self.name, self.value) 808 else: 809 namevalue = self.name 810 return "<Cookie %s for %s>" % (namevalue, limit) 811 812 def __repr__(self): 813 args = [] 814 for name in ("version", "name", "value", 815 "port", "port_specified", 816 "domain", "domain_specified", "domain_initial_dot", 817 "path", "path_specified", 818 "secure", "expires", "discard", "comment", "comment_url", 819 ): 820 attr = getattr(self, name) 821 args.append("%s=%s" % (name, repr(attr))) 822 args.append("rest=%s" % repr(self._rest)) 823 args.append("rfc2109=%s" % repr(self.rfc2109)) 824 return "%s(%s)" % (self.__class__.__name__, ", ".join(args)) 825 826 827class CookiePolicy: 828 """Defines which cookies get accepted from and returned to server. 829 830 May also modify cookies, though this is probably a bad idea. 831 832 The subclass DefaultCookiePolicy defines the standard rules for Netscape 833 and RFC 2965 cookies -- override that if you want a customized policy. 
834 835 """ 836 def set_ok(self, cookie, request): 837 """Return true if (and only if) cookie should be accepted from server. 838 839 Currently, pre-expired cookies never get this far -- the CookieJar 840 class deletes such cookies itself. 841 842 """ 843 raise NotImplementedError() 844 845 def return_ok(self, cookie, request): 846 """Return true if (and only if) cookie should be returned to server.""" 847 raise NotImplementedError() 848 849 def domain_return_ok(self, domain, request): 850 """Return false if cookies should not be returned, given cookie domain. 851 """ 852 return True 853 854 def path_return_ok(self, path, request): 855 """Return false if cookies should not be returned, given cookie path. 856 """ 857 return True 858 859 860class DefaultCookiePolicy(CookiePolicy): 861 """Implements the standard rules for accepting and returning cookies.""" 862 863 DomainStrictNoDots = 1 864 DomainStrictNonDomain = 2 865 DomainRFC2965Match = 4 866 867 DomainLiberal = 0 868 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain 869 870 def __init__(self, 871 blocked_domains=None, allowed_domains=None, 872 netscape=True, rfc2965=False, 873 rfc2109_as_netscape=None, 874 hide_cookie2=False, 875 strict_domain=False, 876 strict_rfc2965_unverifiable=True, 877 strict_ns_unverifiable=False, 878 strict_ns_domain=DomainLiberal, 879 strict_ns_set_initial_dollar=False, 880 strict_ns_set_path=False, 881 ): 882 """Constructor arguments should be passed as keyword arguments only.""" 883 self.netscape = netscape 884 self.rfc2965 = rfc2965 885 self.rfc2109_as_netscape = rfc2109_as_netscape 886 self.hide_cookie2 = hide_cookie2 887 self.strict_domain = strict_domain 888 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable 889 self.strict_ns_unverifiable = strict_ns_unverifiable 890 self.strict_ns_domain = strict_ns_domain 891 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar 892 self.strict_ns_set_path = strict_ns_set_path 893 894 if blocked_domains is not None: 
895 self._blocked_domains = tuple(blocked_domains) 896 else: 897 self._blocked_domains = () 898 899 if allowed_domains is not None: 900 allowed_domains = tuple(allowed_domains) 901 self._allowed_domains = allowed_domains 902 903 def blocked_domains(self): 904 """Return the sequence of blocked domains (as a tuple).""" 905 return self._blocked_domains 906 def set_blocked_domains(self, blocked_domains): 907 """Set the sequence of blocked domains.""" 908 self._blocked_domains = tuple(blocked_domains) 909 910 def is_blocked(self, domain): 911 for blocked_domain in self._blocked_domains: 912 if user_domain_match(domain, blocked_domain): 913 return True 914 return False 915 916 def allowed_domains(self): 917 """Return None, or the sequence of allowed domains (as a tuple).""" 918 return self._allowed_domains 919 def set_allowed_domains(self, allowed_domains): 920 """Set the sequence of allowed domains, or None.""" 921 if allowed_domains is not None: 922 allowed_domains = tuple(allowed_domains) 923 self._allowed_domains = allowed_domains 924 925 def is_not_allowed(self, domain): 926 if self._allowed_domains is None: 927 return False 928 for allowed_domain in self._allowed_domains: 929 if user_domain_match(domain, allowed_domain): 930 return False 931 return True 932 933 def set_ok(self, cookie, request): 934 """ 935 If you override .set_ok(), be sure to call this method. If it returns 936 false, so should your subclass (assuming your subclass wants to be more 937 strict about which cookies to accept). 
938 939 """ 940 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 941 942 assert cookie.name is not None 943 944 for n in "version", "verifiability", "name", "path", "domain", "port": 945 fn_name = "set_ok_"+n 946 fn = getattr(self, fn_name) 947 if not fn(cookie, request): 948 return False 949 950 return True 951 952 def set_ok_version(self, cookie, request): 953 if cookie.version is None: 954 # Version is always set to 0 by parse_ns_headers if it's a Netscape 955 # cookie, so this must be an invalid RFC 2965 cookie. 956 _debug(" Set-Cookie2 without version attribute (%s=%s)", 957 cookie.name, cookie.value) 958 return False 959 if cookie.version > 0 and not self.rfc2965: 960 _debug(" RFC 2965 cookies are switched off") 961 return False 962 elif cookie.version == 0 and not self.netscape: 963 _debug(" Netscape cookies are switched off") 964 return False 965 return True 966 967 def set_ok_verifiability(self, cookie, request): 968 if request.unverifiable and is_third_party(request): 969 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 970 _debug(" third-party RFC 2965 cookie during " 971 "unverifiable transaction") 972 return False 973 elif cookie.version == 0 and self.strict_ns_unverifiable: 974 _debug(" third-party Netscape cookie during " 975 "unverifiable transaction") 976 return False 977 return True 978 979 def set_ok_name(self, cookie, request): 980 # Try and stop servers setting V0 cookies designed to hack other 981 # servers that know both V0 and V1 protocols. 
982 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and 983 cookie.name.startswith("$")): 984 _debug(" illegal name (starts with '$'): '%s'", cookie.name) 985 return False 986 return True 987 988 def set_ok_path(self, cookie, request): 989 if cookie.path_specified: 990 req_path = request_path(request) 991 if ((cookie.version > 0 or 992 (cookie.version == 0 and self.strict_ns_set_path)) and 993 not self.path_return_ok(cookie.path, request)): 994 _debug(" path attribute %s is not a prefix of request " 995 "path %s", cookie.path, req_path) 996 return False 997 return True 998 999 def set_ok_domain(self, cookie, request): 1000 if self.is_blocked(cookie.domain): 1001 _debug(" domain %s is in user block-list", cookie.domain) 1002 return False 1003 if self.is_not_allowed(cookie.domain): 1004 _debug(" domain %s is not in user allow-list", cookie.domain) 1005 return False 1006 if cookie.domain_specified: 1007 req_host, erhn = eff_request_host(request) 1008 domain = cookie.domain 1009 if self.strict_domain and (domain.count(".") >= 2): 1010 # XXX This should probably be compared with the Konqueror 1011 # (kcookiejar.cpp) and Mozilla implementations, but it's a 1012 # losing battle. 
def set_ok_port(self, cookie, request):
    """Return True if the cookie's Port cookie-attribute may be accepted.

    Cookies without an explicit port list are always accepted.  Otherwise
    the comma-separated list is scanned in order: a non-numeric entry
    rejects the cookie, and the scan stops at the first entry equal to the
    request port.  If no entry matches, the cookie is rejected.

    """
    if not cookie.port_specified:
        return True

    wanted = request_port(request)
    wanted = "80" if wanted is None else str(wanted)

    matched = False
    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == wanted:
            # Entries after the first match are deliberately not validated.
            matched = True
            break

    if not matched:
        _debug(" request port (%s) not found in %s",
               wanted, cookie.port)
        return False
    return True
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain allows returning it for request.

    Version 0 (Netscape) cookies must pass a suffix test of the dotted
    effective request-host against the dotted cookie domain and, when the
    DomainStrictNonDomain flag is set in self.strict_ns_domain, a cookie
    whose domain was not specified must equal the effective request-host
    exactly.  Cookies with version > 0 must domain_match() instead.

    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Cookie domain with a guaranteed leading dot (empty values unchanged).
    if not domain or domain.startswith("."):
        dotdomain = domain
    else:
        dotdomain = "." + domain

    if cookie.version == 0:
        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if ((self.strict_ns_domain & self.DomainStrictNonDomain) and
                not cookie.domain_specified and domain != erhn):
            _debug(" cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False
        if not ("." + erhn).endswith(dotdomain):
            _debug(" request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
    elif cookie.version > 0:
        if not domain_match(erhn, domain):
            _debug(" effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
    return True
def vals_sorted_by_key(adict):
    """Return a lazy iterator over adict's values, ordered by sorted key."""
    return map(adict.get, sorted(adict.keys()))


def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an ``items`` attribute is treated as a nested
    mapping and descended into; every other value is yielded as a leaf.

    """
    for item in vals_sorted_by_key(mapping):
        if hasattr(item, "items"):
            yield from deepvalues(item)
        else:
            yield item
1239 """ 1240 1241 non_word_re = re.compile(r"\W") 1242 quote_re = re.compile(r"([\"\\])") 1243 strict_domain_re = re.compile(r"\.?[^.]*") 1244 domain_re = re.compile(r"[^.]*") 1245 dots_re = re.compile(r"^\.+") 1246 1247 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII) 1248 1249 def __init__(self, policy=None): 1250 if policy is None: 1251 policy = DefaultCookiePolicy() 1252 self._policy = policy 1253 1254 self._cookies_lock = _threading.RLock() 1255 self._cookies = {} 1256 1257 def set_policy(self, policy): 1258 self._policy = policy 1259 1260 def _cookies_for_domain(self, domain, request): 1261 cookies = [] 1262 if not self._policy.domain_return_ok(domain, request): 1263 return [] 1264 _debug("Checking %s for cookies to return", domain) 1265 cookies_by_path = self._cookies[domain] 1266 for path in cookies_by_path.keys(): 1267 if not self._policy.path_return_ok(path, request): 1268 continue 1269 cookies_by_name = cookies_by_path[path] 1270 for cookie in cookies_by_name.values(): 1271 if not self._policy.return_ok(cookie, request): 1272 _debug(" not returning cookie") 1273 continue 1274 _debug(" it's a match") 1275 cookies.append(cookie) 1276 return cookies 1277 1278 def _cookies_for_request(self, request): 1279 """Return a list of cookies to be returned to server.""" 1280 cookies = [] 1281 for domain in self._cookies.keys(): 1282 cookies.extend(self._cookies_for_domain(domain, request)) 1283 return cookies 1284 1285 def _cookie_attrs(self, cookies): 1286 """Return a list of cookie-attributes to be returned to server. 1287 1288 like ['foo="bar"; $Path="/"', ...] 1289 1290 The $Version attribute is also added when appropriate (currently only 1291 once per request). 1292 1293 """ 1294 # add cookies in order of most specific (ie. 
longest) path first 1295 cookies.sort(key=lambda a: len(a.path), reverse=True) 1296 1297 version_set = False 1298 1299 attrs = [] 1300 for cookie in cookies: 1301 # set version of Cookie header 1302 # XXX 1303 # What should it be if multiple matching Set-Cookie headers have 1304 # different versions themselves? 1305 # Answer: there is no answer; was supposed to be settled by 1306 # RFC 2965 errata, but that may never appear... 1307 version = cookie.version 1308 if not version_set: 1309 version_set = True 1310 if version > 0: 1311 attrs.append("$Version=%s" % version) 1312 1313 # quote cookie value if necessary 1314 # (not for Netscape protocol, which already has any quotes 1315 # intact, due to the poorly-specified Netscape Cookie: syntax) 1316 if ((cookie.value is not None) and 1317 self.non_word_re.search(cookie.value) and version > 0): 1318 value = self.quote_re.sub(r"\\\1", cookie.value) 1319 else: 1320 value = cookie.value 1321 1322 # add cookie-attributes to be returned in Cookie header 1323 if cookie.value is None: 1324 attrs.append(cookie.name) 1325 else: 1326 attrs.append("%s=%s" % (cookie.name, value)) 1327 if version > 0: 1328 if cookie.path_specified: 1329 attrs.append('$Path="%s"' % cookie.path) 1330 if cookie.domain.startswith("."): 1331 domain = cookie.domain 1332 if (not cookie.domain_initial_dot and 1333 domain.startswith(".")): 1334 domain = domain[1:] 1335 attrs.append('$Domain="%s"' % domain) 1336 if cookie.port is not None: 1337 p = "$Port" 1338 if cookie.port_specified: 1339 p = p + ('="%s"' % cookie.port) 1340 attrs.append(p) 1341 1342 return attrs 1343 1344 def add_cookie_header(self, request): 1345 """Add correct Cookie: header to request (urllib.request.Request object). 1346 1347 The Cookie2 header is also added unless policy.hide_cookie2 is true. 
1348 1349 """ 1350 _debug("add_cookie_header") 1351 self._cookies_lock.acquire() 1352 try: 1353 1354 self._policy._now = self._now = int(time.time()) 1355 1356 cookies = self._cookies_for_request(request) 1357 1358 attrs = self._cookie_attrs(cookies) 1359 if attrs: 1360 if not request.has_header("Cookie"): 1361 request.add_unredirected_header( 1362 "Cookie", "; ".join(attrs)) 1363 1364 # if necessary, advertise that we know RFC 2965 1365 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and 1366 not request.has_header("Cookie2")): 1367 for cookie in cookies: 1368 if cookie.version != 1: 1369 request.add_unredirected_header("Cookie2", '$Version="1"') 1370 break 1371 1372 finally: 1373 self._cookies_lock.release() 1374 1375 self.clear_expired_cookies() 1376 1377 def _normalized_cookie_tuples(self, attrs_set): 1378 """Return list of tuples containing normalised cookie information. 1379 1380 attrs_set is the list of lists of key,value pairs extracted from 1381 the Set-Cookie or Set-Cookie2 headers. 1382 1383 Tuples are name, value, standard, rest, where name and value are the 1384 cookie name and value, standard is a dictionary containing the standard 1385 cookie-attributes (discard, secure, version, expires or max-age, 1386 domain, path and port) and rest is a dictionary containing the rest of 1387 the cookie-attributes. 1388 1389 """ 1390 cookie_tuples = [] 1391 1392 boolean_attrs = "discard", "secure" 1393 value_attrs = ("version", 1394 "expires", "max-age", 1395 "domain", "path", "port", 1396 "comment", "commenturl") 1397 1398 for cookie_attrs in attrs_set: 1399 name, value = cookie_attrs[0] 1400 1401 # Build dictionary of standard cookie-attributes (standard) and 1402 # dictionary of other cookie-attributes (rest). 1403 1404 # Note: expiry time is normalised to seconds since epoch. 
V0 1405 # cookies should have the Expires cookie-attribute, and V1 cookies 1406 # should have Max-Age, but since V1 includes RFC 2109 cookies (and 1407 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we 1408 # accept either (but prefer Max-Age). 1409 max_age_set = False 1410 1411 bad_cookie = False 1412 1413 standard = {} 1414 rest = {} 1415 for k, v in cookie_attrs[1:]: 1416 lc = k.lower() 1417 # don't lose case distinction for unknown fields 1418 if lc in value_attrs or lc in boolean_attrs: 1419 k = lc 1420 if k in boolean_attrs and v is None: 1421 # boolean cookie-attribute is present, but has no value 1422 # (like "discard", rather than "port=80") 1423 v = True 1424 if k in standard: 1425 # only first value is significant 1426 continue 1427 if k == "domain": 1428 if v is None: 1429 _debug(" missing value for domain attribute") 1430 bad_cookie = True 1431 break 1432 # RFC 2965 section 3.3.3 1433 v = v.lower() 1434 if k == "expires": 1435 if max_age_set: 1436 # Prefer max-age to expires (like Mozilla) 1437 continue 1438 if v is None: 1439 _debug(" missing or invalid value for expires " 1440 "attribute: treating as session cookie") 1441 continue 1442 if k == "max-age": 1443 max_age_set = True 1444 try: 1445 v = int(v) 1446 except ValueError: 1447 _debug(" missing or invalid (non-numeric) value for " 1448 "max-age attribute") 1449 bad_cookie = True 1450 break 1451 # convert RFC 2965 Max-Age to seconds since epoch 1452 # XXX Strictly you're supposed to follow RFC 2616 1453 # age-calculation rules. Remember that zero Max-Age 1454 # is a request to discard (old and new) cookie, though. 
1455 k = "expires" 1456 v = self._now + v 1457 if (k in value_attrs) or (k in boolean_attrs): 1458 if (v is None and 1459 k not in ("port", "comment", "commenturl")): 1460 _debug(" missing value for %s attribute" % k) 1461 bad_cookie = True 1462 break 1463 standard[k] = v 1464 else: 1465 rest[k] = v 1466 1467 if bad_cookie: 1468 continue 1469 1470 cookie_tuples.append((name, value, standard, rest)) 1471 1472 return cookie_tuples 1473 1474 def _cookie_from_cookie_tuple(self, tup, request): 1475 # standard is dict of standard cookie-attributes, rest is dict of the 1476 # rest of them 1477 name, value, standard, rest = tup 1478 1479 domain = standard.get("domain", Absent) 1480 path = standard.get("path", Absent) 1481 port = standard.get("port", Absent) 1482 expires = standard.get("expires", Absent) 1483 1484 # set the easy defaults 1485 version = standard.get("version", None) 1486 if version is not None: 1487 try: 1488 version = int(version) 1489 except ValueError: 1490 return None # invalid version, ignore cookie 1491 secure = standard.get("secure", False) 1492 # (discard is also set if expires is Absent) 1493 discard = standard.get("discard", False) 1494 comment = standard.get("comment", None) 1495 comment_url = standard.get("commenturl", None) 1496 1497 # set default path 1498 if path is not Absent and path != "": 1499 path_specified = True 1500 path = escape_path(path) 1501 else: 1502 path_specified = False 1503 path = request_path(request) 1504 i = path.rfind("/") 1505 if i != -1: 1506 if version == 0: 1507 # Netscape spec parts company from reality here 1508 path = path[:i] 1509 else: 1510 path = path[:i+1] 1511 if len(path) == 0: path = "/" 1512 1513 # set default domain 1514 domain_specified = domain is not Absent 1515 # but first we have to remember whether it starts with a dot 1516 domain_initial_dot = False 1517 if domain_specified: 1518 domain_initial_dot = bool(domain.startswith(".")) 1519 if domain is Absent: 1520 req_host, erhn = 
def make_cookies(self, response, request):
    """Return sequence of Cookie objects extracted from response object.

    Set-Cookie2 headers are parsed as RFC 2965 cookies and Set-Cookie
    headers as Netscape / RFC 2109 cookies.  When the policy has rfc2965
    enabled, a Netscape cookie whose (domain, path, name) matches a cookie
    from Set-Cookie2 is dropped in favour of the latter.  An exception
    raised while parsing either kind of header is reported through
    _warn_unhandled_exception() and yields no cookies of that kind.

    """
    # get cookie-attributes for RFC 2965 and Netscape protocols
    headers = response.info()
    rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
    ns_hdrs = headers.get_all("Set-Cookie", [])

    rfc2965 = self._policy.rfc2965
    netscape = self._policy.netscape

    if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
        return []  # no relevant cookie headers: quick exit

    try:
        cookies = self._cookies_from_attrs_set(
            split_header_words(rfc2965_hdrs), request)
    except Exception:
        _warn_unhandled_exception()
        cookies = []

    if ns_hdrs and netscape:
        try:
            # RFC 2109 and Netscape cookies
            ns_cookies = self._cookies_from_attrs_set(
                parse_ns_headers(ns_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            ns_cookies = []
        self._process_rfc2109_cookies(ns_cookies)

        # Look for Netscape cookies (from Set-Cookie headers) that match
        # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
        # For each match, keep the RFC 2965 cookie and ignore the Netscape
        # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
        # bundled in with the Netscape cookies for this purpose, which is
        # reasonable behaviour.
        if rfc2965:
            seen = {(c.domain, c.path, c.name) for c in cookies}
            # Build a real list: a lazy filter object is always truthy,
            # which made the old emptiness test meaningless.
            ns_cookies = [c for c in ns_cookies
                          if (c.domain, c.path, c.name) not in seen]

        cookies.extend(ns_cookies)

    return cookies
def clear_session_cookies(self):
    """Discard all session cookies.

    Every cookie in the jar whose discard flag is true is removed with
    .clear(), while holding the jar's cookie lock.

    Note that the .save() method won't save session cookies anyway, unless
    you ask otherwise by passing a true ignore_discard argument.

    """
    with self._cookies_lock:
        for cookie in self:
            if not cookie.discard:
                continue
            self.clear(cookie.domain, cookie.path, cookie.name)
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but cannot be concatenated
        with a str.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except Exception:
                # Was a bare except, which also swallowed KeyboardInterrupt
                # and SystemExit.
                raise ValueError("filename must be string-like") from None
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  Parsing is delegated to self._really_load().

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Roll back on any failure, not just OSError, so the
                # "state will not be altered" promise above actually holds;
                # the exception is always re-raised.
                self._cookies = old_state
                raise
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default self.filename) in LWP format.

        Raises ValueError if no filename is available.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse "Set-Cookie3" lines from open file f into this jar.

        Raises LoadError if the first line is not an LWP-Cookies magic line
        or if any later line cannot be parsed.  OSError raised while
        reading is propagated unchanged.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the loop so the error handler below can always
        # format it, even if the very first readline() fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1989 1990 In particular, the cookie version and port number information is lost, 1991 together with information about whether or not Path, Port and Discard were 1992 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the 1993 domain as set in the HTTP header started with a dot (yes, I'm aware some 1994 domains in Netscape files start with a dot and some don't -- trust me, you 1995 really don't want to know any more about this). 1996 1997 Note that though Mozilla and Netscape use the same format, they use 1998 slightly different headers. The class saves cookies using the Netscape 1999 header by default (Mozilla can cope with that). 2000 2001 """ 2002 magic_re = re.compile("#( Netscape)? HTTP Cookie File") 2003 header = """\ 2004# Netscape HTTP Cookie File 2005# http://curl.haxx.se/rfc/cookie_spec.html 2006# This is a generated file! Do not edit. 2007 2008""" 2009 2010 def _really_load(self, f, filename, ignore_discard, ignore_expires): 2011 now = time.time() 2012 2013 magic = f.readline() 2014 if not self.magic_re.search(magic): 2015 raise LoadError( 2016 "%r does not look like a Netscape format cookies file" % 2017 filename) 2018 2019 try: 2020 while 1: 2021 line = f.readline() 2022 if line == "": break 2023 2024 # last field may be absent, so keep any trailing tab 2025 if line.endswith("\n"): line = line[:-1] 2026 2027 # skip comments and blank lines XXX what is $ for? 2028 if (line.strip().startswith(("#", "$")) or 2029 line.strip() == ""): 2030 continue 2031 2032 domain, domain_specified, path, secure, expires, name, value = \ 2033 line.split("\t") 2034 secure = (secure == "TRUE") 2035 domain_specified = (domain_specified == "TRUE") 2036 if name == "": 2037 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2038 # with no name, whereas http.cookiejar regards it as a 2039 # cookie with no value. 
2040 name = value 2041 value = None 2042 2043 initial_dot = domain.startswith(".") 2044 assert domain_specified == initial_dot 2045 2046 discard = False 2047 if expires == "": 2048 expires = None 2049 discard = True 2050 2051 # assume path_specified is false 2052 c = Cookie(0, name, value, 2053 None, False, 2054 domain, domain_specified, initial_dot, 2055 path, False, 2056 secure, 2057 expires, 2058 discard, 2059 None, 2060 None, 2061 {}) 2062 if not ignore_discard and c.discard: 2063 continue 2064 if not ignore_expires and c.is_expired(now): 2065 continue 2066 self.set_cookie(c) 2067 2068 except OSError: 2069 raise 2070 except Exception: 2071 _warn_unhandled_exception() 2072 raise LoadError("invalid Netscape format cookies file %r: %r" % 2073 (filename, line)) 2074 2075 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 2076 if filename is None: 2077 if self.filename is not None: filename = self.filename 2078 else: raise ValueError(MISSING_FILENAME_TEXT) 2079 2080 with open(filename, "w") as f: 2081 f.write(self.header) 2082 now = time.time() 2083 for cookie in self: 2084 if not ignore_discard and cookie.discard: 2085 continue 2086 if not ignore_expires and cookie.is_expired(now): 2087 continue 2088 if cookie.secure: secure = "TRUE" 2089 else: secure = "FALSE" 2090 if cookie.domain.startswith("."): initial_dot = "TRUE" 2091 else: initial_dot = "FALSE" 2092 if cookie.expires is not None: 2093 expires = str(cookie.expires) 2094 else: 2095 expires = "" 2096 if cookie.value is None: 2097 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2098 # with no name, whereas http.cookiejar regards it as a 2099 # cookie with no value. 2100 name = "" 2101 value = cookie.name 2102 else: 2103 name = cookie.name 2104 value = cookie.value 2105 f.write( 2106 "\t".join([cookie.domain, initial_dot, cookie.path, 2107 secure, expires, name, value])+ 2108 "\n") 2109