r"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""

__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']

import os
import copy
import datetime
import re
import time
import urllib.parse, urllib.request
import threading as _threading
import http.client  # only for the default HTTP port
from calendar import timegm

debug = False   # set to True to enable debugging via the logging module
logger = None

def _debug(*args):
    """Forward *args to the "http.cookiejar" logger's debug() method.

    Does nothing unless the module-level ``debug`` flag is true.  The logger
    is created lazily on first use and cached in the module-level ``logger``.
    """
    if not debug:
        return
    global logger
    if not logger:
        import logging
        logger = logging.getLogger("http.cookiejar")
    return logger.debug(*args)

HTTPONLY_ATTR = "HTTPOnly"
HTTPONLY_PREFIX = "#HttpOnly_"
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
NETSCAPE_MAGIC_RGX = re.compile("#( Netscape)? HTTP Cookie File")
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
NETSCAPE_HEADER_TEXT = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.

"""

def _warn_unhandled_exception():
    """Issue a warning containing the traceback of the current exception."""
    # There are a few catch-all except: statements in this module, for
    # catching input that's bad in unexpected ways.  Warn if any
    # exceptions are caught there.
    import warnings, traceback
    msg = traceback.format_exc()
    warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)


# Date/time conversion
# -----------------------------------------------------------------------------

EPOCH_YEAR = 1970
def _timegm(tt):
    """Return calendar.timegm(tt), or None if the fields are out of range.

    Only the first six items of tt (year, month, day, hour, minute, second)
    are range-checked; years before EPOCH_YEAR are rejected.
    """
    year, month, mday, hour, min, sec = tt[:6]
    if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
        return timegm(tt)
    else:
        return None

DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
MONTHS_LOWER = [month_name.lower() for month_name in MONTHS]

def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Timezone-aware calls replace the deprecated utcnow()/utcfromtimestamp();
    # the broken-down UTC fields used below are the same.
    if t is None:
        dt = datetime.datetime.now(tz=datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=datetime.timezone.utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)

def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # See time2isoz() for why the timezone-aware constructors are used.
    if t is None:
        dt = datetime.datetime.now(tz=datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=datetime.timezone.utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)


UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by the string tz.

    Names listed in UTC_ZONES give 0; numeric zones such as "-0800", "+01:00"
    or "5" are converted to seconds.  None is returned for anything else.
    """
    offset = None
    if tz in UTC_ZONES:
        offset = 0
    else:
        m = TIMEZONE_RE.search(tz)
        if m:
            offset = 3600 * int(m.group(2))
            if m.group(3):
                offset = offset + 60 * int(m.group(3))
            if m.group(1) == '-':
                offset = -offset
    return offset

def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert string date/time fields to seconds since the epoch.

    mon may be a month name (matched case-insensitively against the
    three-letter names in MONTHS) or a month number.  hr, min, sec and tz
    may be None, meaning 0 / UTC.  Years below 1000 are expanded to the
    century that puts them closest to the current year.

    Returns None if the year exceeds datetime.MAXYEAR, the month is not
    recognised, the fields are out of range, or the timezone is unknown.
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE allows fractional seconds ("29.5"); int() alone would raise
    # ValueError on those, so go through float and drop the fraction.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t

STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    (?:
       ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone
       \s*
    )?
    (?:
       \(\w+\)         # ASCII representation of timezone in parens.
       \s*
    )?$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the *shape* of the month ("Jxx"
            # matches), so an unknown name must be reported as a bad date
            # rather than escaping as an exception.
            return None
        # Seconds are parsed as int so the result is an integer, as
        # documented (and as the loose path below already returns).
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)

ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   (?:
      ([-+]?\d\d?:?(:?\d\d)?
       |Z|z)             # timezone  (Z is "zero meridian", i.e. GMT)
      \s*
   )?$""", re.X | re. ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)


# Header parsing
# -----------------------------------------------------------------------------

def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result

HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not re.search(r"^\w+$", v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)

def strip_quotes(text):
    """Return text without one leading and/or one trailing double quote."""
    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"'):
        text = text[:-1]
    return text

def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for ii, param in enumerate(ns_header.split(';')):
            param = param.strip()

            key, sep, val = param.partition('=')
            key = key.strip()

            if not key:
                if ii == 0:
                    # no cookie name at all: ignore the whole header
                    break
                else:
                    continue

            # allow for a distinction between present and empty and missing
            # altogether
            val = val.strip() if sep else None

            if ii != 0:
                # Only cookie-attributes (not the leading name=value pair)
                # are case-normalised and interpreted.
                lc = key.lower()
                if lc in known_attrs:
                    key = lc

                if key == "version":
                    # This is an RFC 2109 cookie.
                    if val is not None:
                        val = strip_quotes(val)
                    version_set = True
                elif key == "expires":
                    # convert expires date to seconds since epoch
                    if val is not None:
                        val = http2time(strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result


IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    if not A.endswith(B):
        # A does not have form NB.  B must be a *suffix* of A: merely
        # occurring somewhere inside A (x.y.com.evil vs .y.com) is not a
        # match.  N is necessarily non-empty here because A != B.
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True

def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.

    """
    if IPV4_RE.search(text):
        return False
    return True

def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        if A == B:
            # equal IP addresses
            return True
        return False
    initial_dot = B.startswith(".")
    if initial_dot and A.endswith(B):
        return True
    if not initial_dot and A == B:
        return True
    return False

cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urllib.parse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = cut_port_re.sub("", host, 1)
    return host.lower()

def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    erhn = req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn

def request_path(request):
    """Path component of request-URI, as defined by RFC 2965."""
    url = request.get_full_url()
    parts = urllib.parse.urlsplit(url)
    path = escape_path(parts.path)
    if not path.startswith("/"):
        # fix bad RFC 2396 absoluteURI
        path = "/" + path
    return path

def request_port(request):
    """Return the port of request.host as a string.

    DEFAULT_HTTP_PORT is returned when request.host contains no ":".  None is
    returned (and a debug message logged) when the text after the first ":"
    is not an integer.
    """
    host = request.host
    i = host.find(':')
    if i >= 0:
        port = host[i+1:]
        try:
            int(port)
        except ValueError:
            _debug("nonnumeric port: '%s'", port)
            return None
    else:
        port = DEFAULT_HTTP_PORT
    return port

# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
674HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" 675ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") 676def uppercase_escaped_char(match): 677 return "%%%s" % match.group(1).upper() 678def escape_path(path): 679 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" 680 # There's no knowing what character encoding was used to create URLs 681 # containing %-escapes, but since we have to pick one to escape invalid 682 # path characters, we pick UTF-8, as recommended in the HTML 4.0 683 # specification: 684 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 685 # And here, kind of: draft-fielding-uri-rfc2396bis-03 686 # (And in draft IRI specification: draft-duerst-iri-05) 687 # (And here, for new URI schemes: RFC 2718) 688 path = urllib.parse.quote(path, HTTP_PATH_SAFE) 689 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) 690 return path 691 692def reach(h): 693 """Return reach of host h, as defined by RFC 2965, section 1. 694 695 The reach R of a host name H is defined as follows: 696 697 * If 698 699 - H is the host domain name of a host; and, 700 701 - H has the form A.B; and 702 703 - A has no embedded (that is, interior) dots; and 704 705 - B has at least one embedded dot, or B is the string "local". 706 then the reach of H is .B. 707 708 * Otherwise, the reach of H is H. 709 710 >>> reach("www.acme.com") 711 '.acme.com' 712 >>> reach("acme.com") 713 'acme.com' 714 >>> reach("acme.local") 715 '.local' 716 717 """ 718 i = h.find(".") 719 if i >= 0: 720 #a = h[:i] # this line is only here to show what a is 721 b = h[i+1:] 722 i = b.find(".") 723 if is_HDN(h) and (i >= 0 or b == "local"): 724 return "."+b 725 return h 726 727def is_third_party(request): 728 """ 729 730 RFC 2965, section 3.3.6: 731 732 An unverifiable transaction is to a third-party host if its request- 733 host U does not domain-match the reach R of the request-host O in the 734 origin transaction. 
735 736 """ 737 req_host = request_host(request) 738 if not domain_match(req_host, reach(request.origin_req_host)): 739 return True 740 else: 741 return False 742 743 744class Cookie: 745 """HTTP Cookie. 746 747 This class represents both Netscape and RFC 2965 cookies. 748 749 This is deliberately a very simple class. It just holds attributes. It's 750 possible to construct Cookie instances that don't comply with the cookie 751 standards. CookieJar.make_cookies is the factory function for Cookie 752 objects -- it deals with cookie parsing, supplying defaults, and 753 normalising to the representation used in this class. CookiePolicy is 754 responsible for checking them to see whether they should be accepted from 755 and returned to the server. 756 757 Note that the port may be present in the headers, but unspecified ("Port" 758 rather than"Port=80", for example); if this is the case, port is None. 759 760 """ 761 762 def __init__(self, version, name, value, 763 port, port_specified, 764 domain, domain_specified, domain_initial_dot, 765 path, path_specified, 766 secure, 767 expires, 768 discard, 769 comment, 770 comment_url, 771 rest, 772 rfc2109=False, 773 ): 774 775 if version is not None: version = int(version) 776 if expires is not None: expires = int(float(expires)) 777 if port is None and port_specified is True: 778 raise ValueError("if port is None, port_specified must be false") 779 780 self.version = version 781 self.name = name 782 self.value = value 783 self.port = port 784 self.port_specified = port_specified 785 # normalise case, as per RFC 2965 section 3.3.3 786 self.domain = domain.lower() 787 self.domain_specified = domain_specified 788 # Sigh. We need to know whether the domain given in the 789 # cookie-attribute had an initial dot, in order to follow RFC 2965 790 # (as clarified in draft errata). Needed for the returned $Domain 791 # value. 
792 self.domain_initial_dot = domain_initial_dot 793 self.path = path 794 self.path_specified = path_specified 795 self.secure = secure 796 self.expires = expires 797 self.discard = discard 798 self.comment = comment 799 self.comment_url = comment_url 800 self.rfc2109 = rfc2109 801 802 self._rest = copy.copy(rest) 803 804 def has_nonstandard_attr(self, name): 805 return name in self._rest 806 def get_nonstandard_attr(self, name, default=None): 807 return self._rest.get(name, default) 808 def set_nonstandard_attr(self, name, value): 809 self._rest[name] = value 810 811 def is_expired(self, now=None): 812 if now is None: now = time.time() 813 if (self.expires is not None) and (self.expires <= now): 814 return True 815 return False 816 817 def __str__(self): 818 if self.port is None: p = "" 819 else: p = ":"+self.port 820 limit = self.domain + p + self.path 821 if self.value is not None: 822 namevalue = "%s=%s" % (self.name, self.value) 823 else: 824 namevalue = self.name 825 return "<Cookie %s for %s>" % (namevalue, limit) 826 827 def __repr__(self): 828 args = [] 829 for name in ("version", "name", "value", 830 "port", "port_specified", 831 "domain", "domain_specified", "domain_initial_dot", 832 "path", "path_specified", 833 "secure", "expires", "discard", "comment", "comment_url", 834 ): 835 attr = getattr(self, name) 836 args.append("%s=%s" % (name, repr(attr))) 837 args.append("rest=%s" % repr(self._rest)) 838 args.append("rfc2109=%s" % repr(self.rfc2109)) 839 return "%s(%s)" % (self.__class__.__name__, ", ".join(args)) 840 841 842class CookiePolicy: 843 """Defines which cookies get accepted from and returned to server. 844 845 May also modify cookies, though this is probably a bad idea. 846 847 The subclass DefaultCookiePolicy defines the standard rules for Netscape 848 and RFC 2965 cookies -- override that if you want a customized policy. 
849 850 """ 851 def set_ok(self, cookie, request): 852 """Return true if (and only if) cookie should be accepted from server. 853 854 Currently, pre-expired cookies never get this far -- the CookieJar 855 class deletes such cookies itself. 856 857 """ 858 raise NotImplementedError() 859 860 def return_ok(self, cookie, request): 861 """Return true if (and only if) cookie should be returned to server.""" 862 raise NotImplementedError() 863 864 def domain_return_ok(self, domain, request): 865 """Return false if cookies should not be returned, given cookie domain. 866 """ 867 return True 868 869 def path_return_ok(self, path, request): 870 """Return false if cookies should not be returned, given cookie path. 871 """ 872 return True 873 874 875class DefaultCookiePolicy(CookiePolicy): 876 """Implements the standard rules for accepting and returning cookies.""" 877 878 DomainStrictNoDots = 1 879 DomainStrictNonDomain = 2 880 DomainRFC2965Match = 4 881 882 DomainLiberal = 0 883 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain 884 885 def __init__(self, 886 blocked_domains=None, allowed_domains=None, 887 netscape=True, rfc2965=False, 888 rfc2109_as_netscape=None, 889 hide_cookie2=False, 890 strict_domain=False, 891 strict_rfc2965_unverifiable=True, 892 strict_ns_unverifiable=False, 893 strict_ns_domain=DomainLiberal, 894 strict_ns_set_initial_dollar=False, 895 strict_ns_set_path=False, 896 secure_protocols=("https", "wss") 897 ): 898 """Constructor arguments should be passed as keyword arguments only.""" 899 self.netscape = netscape 900 self.rfc2965 = rfc2965 901 self.rfc2109_as_netscape = rfc2109_as_netscape 902 self.hide_cookie2 = hide_cookie2 903 self.strict_domain = strict_domain 904 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable 905 self.strict_ns_unverifiable = strict_ns_unverifiable 906 self.strict_ns_domain = strict_ns_domain 907 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar 908 self.strict_ns_set_path = strict_ns_set_path 
909 self.secure_protocols = secure_protocols 910 911 if blocked_domains is not None: 912 self._blocked_domains = tuple(blocked_domains) 913 else: 914 self._blocked_domains = () 915 916 if allowed_domains is not None: 917 allowed_domains = tuple(allowed_domains) 918 self._allowed_domains = allowed_domains 919 920 def blocked_domains(self): 921 """Return the sequence of blocked domains (as a tuple).""" 922 return self._blocked_domains 923 def set_blocked_domains(self, blocked_domains): 924 """Set the sequence of blocked domains.""" 925 self._blocked_domains = tuple(blocked_domains) 926 927 def is_blocked(self, domain): 928 for blocked_domain in self._blocked_domains: 929 if user_domain_match(domain, blocked_domain): 930 return True 931 return False 932 933 def allowed_domains(self): 934 """Return None, or the sequence of allowed domains (as a tuple).""" 935 return self._allowed_domains 936 def set_allowed_domains(self, allowed_domains): 937 """Set the sequence of allowed domains, or None.""" 938 if allowed_domains is not None: 939 allowed_domains = tuple(allowed_domains) 940 self._allowed_domains = allowed_domains 941 942 def is_not_allowed(self, domain): 943 if self._allowed_domains is None: 944 return False 945 for allowed_domain in self._allowed_domains: 946 if user_domain_match(domain, allowed_domain): 947 return False 948 return True 949 950 def set_ok(self, cookie, request): 951 """ 952 If you override .set_ok(), be sure to call this method. If it returns 953 false, so should your subclass (assuming your subclass wants to be more 954 strict about which cookies to accept). 
955 956 """ 957 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 958 959 assert cookie.name is not None 960 961 for n in "version", "verifiability", "name", "path", "domain", "port": 962 fn_name = "set_ok_"+n 963 fn = getattr(self, fn_name) 964 if not fn(cookie, request): 965 return False 966 967 return True 968 969 def set_ok_version(self, cookie, request): 970 if cookie.version is None: 971 # Version is always set to 0 by parse_ns_headers if it's a Netscape 972 # cookie, so this must be an invalid RFC 2965 cookie. 973 _debug(" Set-Cookie2 without version attribute (%s=%s)", 974 cookie.name, cookie.value) 975 return False 976 if cookie.version > 0 and not self.rfc2965: 977 _debug(" RFC 2965 cookies are switched off") 978 return False 979 elif cookie.version == 0 and not self.netscape: 980 _debug(" Netscape cookies are switched off") 981 return False 982 return True 983 984 def set_ok_verifiability(self, cookie, request): 985 if request.unverifiable and is_third_party(request): 986 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 987 _debug(" third-party RFC 2965 cookie during " 988 "unverifiable transaction") 989 return False 990 elif cookie.version == 0 and self.strict_ns_unverifiable: 991 _debug(" third-party Netscape cookie during " 992 "unverifiable transaction") 993 return False 994 return True 995 996 def set_ok_name(self, cookie, request): 997 # Try and stop servers setting V0 cookies designed to hack other 998 # servers that know both V0 and V1 protocols. 
999 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and 1000 cookie.name.startswith("$")): 1001 _debug(" illegal name (starts with '$'): '%s'", cookie.name) 1002 return False 1003 return True 1004 1005 def set_ok_path(self, cookie, request): 1006 if cookie.path_specified: 1007 req_path = request_path(request) 1008 if ((cookie.version > 0 or 1009 (cookie.version == 0 and self.strict_ns_set_path)) and 1010 not self.path_return_ok(cookie.path, request)): 1011 _debug(" path attribute %s is not a prefix of request " 1012 "path %s", cookie.path, req_path) 1013 return False 1014 return True 1015 1016 def set_ok_domain(self, cookie, request): 1017 if self.is_blocked(cookie.domain): 1018 _debug(" domain %s is in user block-list", cookie.domain) 1019 return False 1020 if self.is_not_allowed(cookie.domain): 1021 _debug(" domain %s is not in user allow-list", cookie.domain) 1022 return False 1023 if cookie.domain_specified: 1024 req_host, erhn = eff_request_host(request) 1025 domain = cookie.domain 1026 if self.strict_domain and (domain.count(".") >= 2): 1027 # XXX This should probably be compared with the Konqueror 1028 # (kcookiejar.cpp) and Mozilla implementations, but it's a 1029 # losing battle. 
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for this request.

    The domain is first checked against the user block/allow lists.  The
    remaining checks run only when the domain was explicitly specified:
    country-code second-level domains (when self.strict_domain is set),
    embedded dots, and matching against the effective request-host, with
    the stricter matches selected by cookie version and the
    self.strict_ns_domain flags.
    """
    if self.is_blocked(cookie.domain):
        _debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        _debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not cookie.domain_specified:
        return True

    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if self.strict_domain and domain.count(".") >= 2:
        # XXX This should probably be compared with the Konqueror
        # (kcookiejar.cpp) and Mozilla implementations, but it's a
        # losing battle.
        last_dot = domain.rfind(".")
        prev_dot = domain.rfind(".", 0, last_dot)
        if prev_dot == 0:  # domain like .foo.bar
            tld = domain[last_dot + 1:]
            sld = domain[prev_dot + 1:last_dot]
            generic_slds = (
                "co", "ac", "com", "edu", "org", "net",
                "gov", "mil", "int", "aero", "biz", "cat", "coop",
                "info", "jobs", "mobi", "museum", "name", "pro",
                "travel", "eu")
            if sld.lower() in generic_slds and len(tld) == 2:
                # domain like .co.uk
                _debug(" country-code second level domain %s", domain)
                return False

    undotted_domain = domain[1:] if domain.startswith(".") else domain
    if "." not in undotted_domain and domain != ".local":
        _debug(" non-local domain %s contains no embedded dot",
               domain)
        return False

    if cookie.version == 0:
        host_ends_with_domain = (erhn.endswith(domain) or
                                 erhn.startswith(".") or
                                 ("." + erhn).endswith(domain))
        if not host_ends_with_domain:
            _debug(" effective request-host %s (even with added "
                   "initial dot) does not end with %s",
                   erhn, domain)
            return False

    if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
        if not domain_match(erhn, domain):
            _debug(" effective request-host %s does not domain-match "
                   "%s", erhn, domain)
            return False

    if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
        host_prefix = req_host[:-len(domain)]
        if "." in host_prefix and not IPV4_RE.search(req_host):
            _debug(" host prefix %s for domain %s contains a dot",
                   host_prefix, domain)
            return False
    return True


def set_ok_port(self, cookie, request):
    """Return True if the request port is allowed by the cookie's Port list.

    Cookies without an explicitly specified port always pass.  Otherwise
    every comma-separated entry of cookie.port must be numeric, and one of
    them must equal the request port ("80" when request_port() gives None).
    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    req_port = "80" if req_port is None else str(req_port)

    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True

    _debug(" request port (%s) not found in %s",
           req_port, cookie.port)
    return False
def return_ok(self, cookie, request):
    """
    If you override .return_ok(), be sure to call this method. If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    Runs the return_ok_version, _verifiability, _secure, _expires, _port
    and _domain checks in that order and stops at the first failure.

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    check_names = ("version", "verifiability", "secure", "expires",
                   "port", "domain")
    return all(getattr(self, "return_ok_" + check)(cookie, request)
               for check in check_names)


def return_ok_version(self, cookie, request):
    """Return False if the cookie's protocol is switched off in this policy."""
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True


def return_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies in unverifiable requests.

    Applies only when request.unverifiable is true and
    is_third_party(request) holds, subject to the
    strict_rfc2965_unverifiable / strict_ns_unverifiable settings.
    """
    if not (request.unverifiable and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True


def return_ok_secure(self, cookie, request):
    """Return False if a secure cookie would go out on a non-secure request.

    A request counts as secure when request.type is listed in
    self.secure_protocols.
    """
    if not cookie.secure:
        return True
    if request.type in self.secure_protocols:
        return True
    _debug(" secure cookie with non-secure request")
    return False


def return_ok_expires(self, cookie, request):
    """Return False if the cookie has expired relative to self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        _debug(" cookie expired")
        return False
    return True
def return_ok_port(self, cookie, request):
    """Return True if the cookie may be returned on the request's port.

    Cookies with no port restriction always pass.  Otherwise the request
    port ("80" when request_port() gives None) must equal one of the
    comma-separated entries of cookie.port.
    """
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    if any(allowed == req_port for allowed in cookie.port.split(",")):
        return True
    _debug(" request port %s does not match cookie port %s",
           req_port, cookie.port)
    return False


def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain matches the request host.

    Version > 0 cookies must domain_match() the effective request-host;
    version 0 cookies are compared by suffix, with a leading dot added to
    both sides.  When DomainStrictNonDomain is set in self.strict_ns_domain,
    a version 0 cookie with no explicit domain must equal the effective
    request-host exactly.
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    dotdomain = domain
    if domain and not domain.startswith("."):
        dotdomain = "." + domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
        _debug(" cookie with unspecified domain does not string-compare "
               "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("." + erhn).endswith(dotdomain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True


def domain_return_ok(self, domain, request):
    """Return False if no cookie stored under domain could match request.

    The request host and effective request-host (each with a leading dot
    added if missing) must end with the dotted form of domain, and the
    domain must pass the user block/allow lists.
    """
    # Liberal check of the domain. This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "." + req_host
    if not erhn.startswith("."):
        erhn = "." + erhn

    dotdomain = domain
    if domain and not domain.startswith("."):
        dotdomain = "." + domain

    host_matches = (req_host.endswith(dotdomain) or
                    erhn.endswith(dotdomain))
    if not host_matches:
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True


def path_return_ok(self, path, request):
    """Return True if the request path equals path or lies beneath it.

    A prefix match only counts when it ends on a "/" boundary: either path
    itself ends with "/", or the next character of the request path is "/".
    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    pathlen = len(path)
    if req_path == path:
        return True
    if req_path.startswith(path):
        on_boundary = (path.endswith("/") or
                       req_path[pathlen:pathlen + 1] == "/")
        if on_boundary:
            return True

    _debug(" %s does not path-match %s", req_path, path)
    return False
def vals_sorted_by_key(adict):
    """Return an iterator over adict's values, ordered by sorted key.

    The keys are sorted immediately; each value is looked up with
    adict.get only when the iterator is advanced.
    """
    ordered_keys = sorted(adict)
    return map(adict.get, ordered_keys)


def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key."""
    for value in vals_sorted_by_key(mapping):
        # Anything exposing an .items attribute is treated as a nested
        # mapping and descended into; everything else is a leaf.
        if hasattr(value, "items"):
            yield from deepvalues(value)
        else:
            yield value


# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent:
    """Marker class used as a "no such key" default for dict.get()."""
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib.request.build_opener(HTTPCookieProcessor).open(url).

    Cookies are kept in self._cookies, a nested dict indexed by domain,
    then path, then name.  self._cookies_lock (a re-entrant lock) is held
    by the methods below that read or update that structure on behalf of a
    request or response.
    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)

    def __init__(self, policy=None):
        """Create an empty jar; policy defaults to DefaultCookiePolicy()."""
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows
        to be returned for request."""
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        _debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    _debug(" not returning cookie")
                    continue
                _debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first
        cookies.sort(key=lambda a: len(a.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            # different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            # RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            # intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                    self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                            domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib.request.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        Expired cookies are cleared from the jar afterwards.

        """
        _debug("add_cookie_header")
        with self._cookies_lock:
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                    not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a missing domain value, a missing or non-numeric
        max-age, or a missing value for another standard attribute that
        requires one are left out of the result.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch. V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        _debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        _debug(" missing or invalid value for expires "
                               "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (TypeError, ValueError):
                        # TypeError covers a Max-Age attribute with no value
                        # at all (v is None); without it the error would
                        # escape and take every other cookie with it.
                        _debug(" missing or invalid (non-numeric) value for "
                               "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    # age-calculation rules. Remember that zero Max-Age
                    # is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                            k not in ("port", "comment", "commenturl")):
                        _debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, filling in defaults.

        Returns None when the version is not an integer, or when the expiry
        time is not later than self._now; in the latter case any stored
        cookie with the same domain, path and name is removed from the jar.
        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0:
                path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present. Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie. This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects that could be built from attrs_set."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie:
                cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Mark version 1 cookies as RFC 2109 and optionally downgrade them.

        Whether they are downgraded to version 0 is taken from the policy's
        rfc2109_as_netscape attribute; if that is missing or None, they are
        downgraded exactly when the policy's rfc2965 flag is false.
        """
        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
        if rfc2109_as_ns is None:
            rfc2109_as_ns = not self._policy.rfc2965
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_ns:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
        ns_hdrs = headers.get_all("Set-Cookie", [])
        self._policy._now = self._now = int(time.time())

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
                (not ns_hdrs and not rfc2965) or
                (not rfc2965_hdrs and not netscape) or
                (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except Exception:
                _warn_unhandled_exception()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                rfc2965_keys = {(c.domain, c.path, c.name) for c in cookies}
                ns_cookies = [
                    c for c in ns_cookies
                    if (c.domain, c.path, c.name) not in rfc2965_keys]

            cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        with self._cookies_lock:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        with self._cookies_lock:
            if cookie.domain not in c:
                c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2:
                c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        _debug("extract_cookies: %s", response.info())
        with self._cookies_lock:
            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    _debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies. If
        given a single argument, only cookies belonging to that domain will be
        removed. If given two arguments, cookies belonging to the specified
        path within that domain are removed. If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists, and ValueError if a
        name is given without both domain and path, or a path without a
        domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        with self._cookies_lock:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        with self._cookies_lock:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)

    def __iter__(self):
        """Iterate over all stored cookies via deepvalues()."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        return sum(1 for _ in self)

    def __repr__(self):
        cookie_reprs = [repr(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(cookie_reprs))

    def __str__(self):
        cookie_strs = [str(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(cookie_strs))
# derives from OSError for backwards-compatibility with Python 2.4.0
class LoadError(OSError):
    """Raised when a cookies file cannot be parsed."""


class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename, if given, is converted with os.fspath(); delayload is
        stored as a bool.

        """
        CookieJar.__init__(self, policy)
        self.filename = None if filename is None else os.fspath(filename)
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as stream:
            self._really_load(stream, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            # Keep a copy so the jar can be put back if loading fails.
            saved_cookies = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except OSError:
                self._cookies = saved_cookies
                raise
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    The name/value pair comes first, then path and domain, then the
    optional and flag attributes, then the cookie's non-standard attributes
    in sorted key order, and finally the version.

    """
    pairs = [(cookie.name, cookie.value),
             ("path", cookie.path),
             ("domain", cookie.domain)]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))
    # Flag attributes are written with no value.
    if cookie.path_specified:
        pairs.append(("path_spec", None))
    if cookie.port_specified:
        pairs.append(("port_spec", None))
    if cookie.domain_initial_dot:
        pairs.append(("domain_dot", None))
    if cookie.secure:
        pairs.append(("secure", None))
    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    pairs.extend((key, str(cookie._rest[key]))
                 for key in sorted(cookie._rest.keys()))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default: self.filename) in LWP format.

        Raises ValueError if no filename is available.  A newly created
        file is readable and writable by its owner only.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        # Cookies frequently carry session credentials, so do not rely on
        # the process umask: create the file with mode 0o600.
        with os.fdopen(
            os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600),
            "w",
        ) as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read "Set-Cookie3:" lines from the open file f into the jar.

        filename is used only in error messages.  Raises LoadError if the
        first line is not an LWP-Cookies magic line or if a line cannot be
        parsed; OSError is passed through unchanged.
        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the loop so the LoadError message below is still
        # well-defined if reading the very first data line fails.
        line = magic
        try:
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies. I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file. This class uses the Mozilla/Netscape
    `cookies.txt' format. lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers. The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read tab-separated cookie lines from the open file f into the jar.

        filename is used only in error messages.  Raises LoadError if the
        first line is not a Netscape cookie-file header or if a line cannot
        be parsed; OSError is passed through unchanged.
        """
        now = time.time()

        magic = f.readline()
        if not NETSCAPE_MAGIC_RGX.match(magic):
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound before the loop so the LoadError message below is still
        # well-defined if reading the very first data line fails.
        line = magic
        try:
            for line in iter(f.readline, ""):
                rest = {}

                # httponly is a cookie flag as defined in rfc6265
                # when encoded in a netscape cookie file,
                # the line is prepended with "#HttpOnly_"
                if line.startswith(HTTPONLY_PREFIX):
                    rest[HTTPONLY_ATTR] = ""
                    line = line[len(HTTPONLY_PREFIX):]

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"):
                    line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                        line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                    line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check rather than assert, so that inconsistent
                # lines are still rejected when Python runs with -O.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain-specified flag is inconsistent with domain %r"
                        % domain)

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           rest)
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default: self.filename) in
        cookies.txt format.

        Raises ValueError if no filename is available.  A newly created
        file is readable and writable by its owner only.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        # Cookies frequently carry session credentials, so do not rely on
        # the process umask: create the file with mode 0o600.
        with os.fdopen(
            os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o600),
            "w",
        ) as f:
            f.write(NETSCAPE_HEADER_TEXT)
            now = time.time()
            for cookie in self:
                domain = cookie.domain
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                initial_dot = "TRUE" if domain.startswith(".") else "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                if cookie.has_nonstandard_attr(HTTPONLY_ATTR):
                    domain = HTTPONLY_PREFIX + domain
                f.write(
                    "\t".join([domain, initial_dot, cookie.path,
                               secure, expires, name, value]) +
                    "\n")