"""RFC 2822 message manipulation.

Note: This is only a very rough sketch of a full RFC-822 parser; in particular
the tokenizing of addresses does not adhere to all the quoting rules.

Note: RFC 2822 is a long awaited update to RFC 822.  This module should
conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
effort at RFC 2822 updates have been made, but a thorough audit has not been
performed.  Consider any RFC 2822 non-conformance to be a bug.

    RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
    RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)

Directions for use:

To create a Message object: first open a file, e.g.:

  fp = open(file, 'r')

You can use any other legal way of getting an open file object, e.g. use
sys.stdin or call os.popen().  Then pass the open file object to the Message()
constructor:

  m = Message(fp)

This class can work with any input object that supports a readline method.  If
the input object has seek and tell capability, the rewindbody method will
work; also illegal lines will be pushed back onto the input stream.  If the
input object lacks seek but has an `unread' method that can push back a line
of input, Message will use that to push back illegal lines.  Thus this class
can be used to parse messages coming from a buffered stream.

The optional `seekable' argument is provided as a workaround for certain stdio
libraries in which tell() discards buffered data before discovering that the
lseek() system call doesn't work.  For maximum portability, you should set the
seekable argument to zero to prevent that initial tell() when passing in
an unseekable object such as a file object created from a socket object.  If
it is 1 on entry -- which it is by default -- the tell() method of the open
file object is called once; if this raises an exception, seekable is reset to
0.  For other nonzero values of seekable, this test is not made.

To get the text of a particular header there are several methods:

  str = m.getheader(name)
  str = m.getrawheader(name)

where name is the name of the header, e.g. 'Subject'.  The difference is that
getheader() strips the leading and trailing whitespace, while getrawheader()
doesn't.  Both functions retain embedded whitespace (including newlines)
exactly as they are specified in the header, and leave the case of the text
unchanged.

For addresses and address lists there are functions

  realname, mailaddress = m.getaddr(name)
  list = m.getaddrlist(name)

where the latter returns a list of (realname, mailaddr) tuples.

There is also a method

  time = m.getdate(name)

which parses a Date-like field and returns a time-compatible tuple,
i.e. a tuple such as returned by time.localtime() or accepted by
time.mktime().

See the class definition for lower level access methods.

There are also some utility functions here.
"""
# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>

import calendar
import time

try:
    from warnings import warnpy3k
except ImportError:
    # warnings.warnpy3k only exists on Python 2.6/2.7.  Fall back to a plain
    # DeprecationWarning so the module can still be imported elsewhere.
    import warnings

    def warnpy3k(message, category=None, stacklevel=1):
        """Stand-in for warnings.warnpy3k on interpreters that lack it."""
        warnings.warn(message, DeprecationWarning, stacklevel + 1)

warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
         stacklevel=2)

__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]

_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC 2822-compliant message.

    Instance attributes set by the constructor / readheaders():

      fp              -- the input object the headers were read from
      seekable        -- nonzero while fp.tell()/fp.seek() are believed usable
      startofheaders  -- fp.tell() before the headers were read, or None
      startofbody     -- fp.tell() after the headers were read, or None
      headers         -- list of raw header lines, in input order
      dict            -- lower-cased header name -> stripped value (last wins)
      unixfrom        -- any leading Unix "From " envelope line(s)
      status          -- '' on success, otherwise an error message
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers."""
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the input was found not to be seekable.
        """
        if not self.seekable:
            # Call form instead of "raise IOError, msg": same exception,
            # but the syntax is accepted by every Python version.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that terminates them.
        The (normally blank) line that ends the headers is skipped, but not
        included in the returned list.  If a non-header line ends the headers,
        (which is an error), an attempt is made to backspace over it; it is
        never included in the returned list.

        The variable self.status is set to the empty string if all went well,
        otherwise it is an error message.  The variable self.headers is a
        completely uninterpreted list of lines contained in the header (so
        printing them will reproduce the header exactly as it appears in the
        file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lst = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Prefer an explicit unread() for pushback; otherwise remember the
        # position of each line so we can seek() back to it.
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lst.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lst.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            elif headerseen is not None:
                # An empty header name. These aren't allowed in HTTP, but it's
                # probably a benign mistake. Don't add the header, just keep
                # going.
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats with special header formats.

        As implemented here: the lower-cased text before the first ':' (which
        may be the empty string), or None if the line contains no ':'.
        """
        i = line.find(':')
        if i > -1:
            return line[:i].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC 2822 headers.

        You may override this method if your application wants to bend the
        rules, e.g. to strip trailing whitespace, or to recognize MH template
        separators ('--------').  For convenience (e.g. for code reading from
        sockets) a line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats that support embedded comments or
        free-text data.
        """
        return False

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines matching a given
        header name (and their continuation lines).  A list of the lines is
        returned, without interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple times, all
        occurrences are returned.  Case is not important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A new, different header starts here; continuation lines
                # (leading whitespace) leave `hit' untouched.
                hit = 0
            if hit:
                lst.append(line)
        return lst

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns only the
        first matching header (and its continuation lines).
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if hit:
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                lst.append(line)
        return lst

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the header but with the
        keyword stripped.  All leading, trailing and embedded whitespace is
        kept in the string, however.  Return None if the header does not
        occur.
        """

        lst = self.getfirstmatchingheader(name)
        if not lst:
            return None
        # Drop "name:" from the first line only.
        lst[0] = lst[0][len(name) + 1:]
        return ''.join(lst)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped version of the
        header value for a given header name, or None if it doesn't exist.
        This uses the dictionary version which finds the *last* such header.
        """
        return self.dict.get(name.lower(), default)
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once; each
        value in the result list is stripped in the same way as the result of
        getheader().  If the header is not given, return an empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the values of repeated headers.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddressList(alladdrs)
        return a.addresslist

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning a tuple
        compatible with time.mktime().  Returns None if the header is
        missing or cannot be parsed.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with time.mktime(),
        and the 10th is the offset of the poster's time zone from GMT/UTC.
        Returns None if the header is missing or cannot be parsed.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because any
        changed headers get stuck at the end of the raw-headers list rather
        than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        lst = []
        hit = 0
        for i, line in enumerate(self.headers):
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if hit:
                lst.append(i)
        # Delete from the end so the remaining indices stay valid.
        for i in reversed(lst):
            del self.headers[i]

    def setdefault(self, name, default=""):
        """Return the header's value, adding it with `default' if absent."""
        lowername = name.lower()
        if lowername in self.dict:
            return self.dict[lowername]
        else:
            text = name + ": " + default
            for line in text.split("\n"):
                self.headers.append(line + "\n")
            self.dict[lowername] = default
            return default

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header field names."""
        return iter(self.dict)

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined back together."""
        return ''.join(self.headers)


# Utility functions
# -----------------

# XXX Should fix unquote() and quote() to be really conformant.
# XXX The inverses of the parse functions may also be useful.


def unquote(s):
    """Remove quotes from a string.

    Strips one pair of surrounding double quotes (also undoing backslash
    escapes of backslash and double quote) or one pair of angle brackets.
    Any other string is returned unchanged.
    """
    if len(s) > 1:
        if s.startswith('"') and s.endswith('"'):
            return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if s.startswith('<') and s.endswith('>'):
            return s[1:-1]
    return s


def quote(s):
    """Backslash-escape backslashes and double quotes in a string.

    Note that no surrounding quote characters are added.
    """
    return s.replace('\\', '\\\\').replace('"', '\\"')


def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Returns (None, None) if no address could be parsed.
    """
    a = AddressList(address)
    lst = a.addresslist
    if not lst:
        return (None, None)
    return lst[0]


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC 2822 in front of you.

    http://www.faqs.org/rfcs/rfc2822.html

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing one or more
        addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address."""
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.
        """
        result = []
        ad = self.getaddress()
        while ad:
            result += ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples; a group address may
        contribute several entries and an unparseable one none.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' + \
                         ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the parser keeps making progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos += 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec."""
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            # Local part only; there is no domain to append.
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.  If self is not
        looking at an instance of `beginchar' then getdelimited returns the
        empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos += 1
        while self.pos < len(self.field):
            if quote == 1:
                # Previous character was a backslash: take this one verbatim.
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
    def __init__(self, field):
        """Parse `field' into self.addresslist (empty for a false field)."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, separated by ', '."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Return a new AddressList holding the union of both lists."""
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        """Add the addresses of `other' that are not already present."""
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        """Return a new AddressList without the addresses in `other'."""
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        """Remove the addresses that also occur in `other'."""
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        """Return the address (or slice of addresses) at `index'."""
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]

def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form."""
    if pair[0]:
        return '"' + pair[0] + '" <' + pair[1] + '>'
    else:
        return pair[1]

# Parse a date field

_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple: the first nine items are
    (year, month, day, hour, minute, second, 0, 1, 0) and the tenth is the
    zone's offset from UTC in seconds, or None when no zone was recognised.
    Returns None if the string cannot be parsed.

    Zones may be numeric (+hhmm / -hhmm) or one of the names in _timezones;
    of the military zones only Z is recognised.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: there is no first field to look at.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # no space after the "weekday,"?
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time, e.g. "13:32:02+0100".
        s = data[3]
        plus = s.find('+')
        minus = s.find('-')
        if plus > 0:
            data[3:] = [s[:plus], s[plus+1:]]
        elif plus == -1 and minus > 0:
            # Keep the sign: it is what makes the offset negative.
            data[3:] = [s[:minus], s[minus:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm)+1
    if mm > 12: mm = mm - 12
    # endswith()/slicing instead of [-1]/[0]: a field can be empty here
    # (e.g. a lone ","), which must mean "unparseable", not IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)


def parsedate(data):
    """Convert a time string to a time tuple.

    Same as parsedate_tz() but without the trailing zone offset; returns
    None if the string cannot be parsed.
    """
    t = parsedate_tz(data)
    if t is None:
        return t
    return t[:9]


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    else:
        # Treat the fields as UTC and then remove the sender's offset.  This
        # avoids going through time.mktime(), whose result depends on the
        # local zone's rules.  float() keeps the return type the same as in
        # the branch above.
        return float(calendar.timegm(data)) - data[9]

def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    According to RFC 1123, day and month names must always be in
    English.  If not for that, this code could use strftime().  It
    can't because strftime() honors the locale and could generate
    non-English names.

    `timeval' is seconds since the epoch; the current time is used if it
    is None.
    """
    if timeval is None:
        timeval = time.time()
    timeval = time.gmtime(timeval)
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
            ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]],
            timeval[2],
            ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1]-1],
                                timeval[0], timeval[3], timeval[4], timeval[5])


# When used as script, run a small test program.
# The first command line argument must be a filename containing one
# message in RFC-822 format.

if __name__ == '__main__':
    # Self-test: parse one message file and dump what was found.
    #
    # Every print() below is given a single pre-formatted string, so the
    # output is the same whether "print" is the 2.x statement or a function.
    import os
    import sys

    if sys.argv[1:]:
        path = sys.argv[1]
    else:
        # Only consult $HOME when no file was named on the command line.
        path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')

    with open(path, 'r') as f:
        m = Message(f)
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        # getdate_tz() returns None for a missing or unparseable Date
        # header, so test it before indexing or converting it.
        if date:
            tz = date[-1]
            pieces = ['ParsedDate:',
                      time.asctime(time.localtime(mktime_tz(date)))]
            if tz is not None:
                # tz is an offset in seconds; show it as +hhmm (plus any
                # leftover seconds).
                hhmm, ss = divmod(tz, 60)
                hh, mm = divmod(hhmm, 60)
                pieces.append("%+03d%02d" % (hh, mm))
                if ss:
                    pieces.append(".%02d" % ss)
            print(' '.join(pieces))
        else:
            print('ParsedDate: None')
        m.rewindbody()
        n = 0
        while f.readline():
            n += 1
        print('Lines: %s' % (n,))
        print('-'*70)
        print('len = %s' % (len(m),))
        if 'Date' in m:
            print('Date = %s' % (m['Date'],))
        if 'X-Nonsense' in m:
            pass
        print('keys = %s' % (m.keys(),))
        print('values = %s' % (m.values(),))
        print('items = %s' % (m.items(),))