• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""RFC 2822 message manipulation.
2
3Note: This is only a very rough sketch of a full RFC-822 parser; in particular
4the tokenizing of addresses does not adhere to all the quoting rules.
5
Note: RFC 2822 is a long-awaited update to RFC 822.  This module should
conform to RFC 2822, and is thus misnamed (it's not worth renaming it).  Some
effort has been made to update it for RFC 2822, but a thorough audit has not
been performed.  Consider any RFC 2822 non-conformance to be a bug.
10
11    RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
12    RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
13
14Directions for use:
15
16To create a Message object: first open a file, e.g.:
17
18  fp = open(file, 'r')
19
20You can use any other legal way of getting an open file object, e.g. use
21sys.stdin or call os.popen().  Then pass the open file object to the Message()
22constructor:
23
24  m = Message(fp)
25
26This class can work with any input object that supports a readline method.  If
27the input object has seek and tell capability, the rewindbody method will
28work; also illegal lines will be pushed back onto the input stream.  If the
29input object lacks seek but has an `unread' method that can push back a line
30of input, Message will use that to push back illegal lines.  Thus this class
31can be used to parse messages coming from a buffered stream.
32
The optional `seekable' argument is provided as a workaround for certain stdio
libraries in which tell() discards buffered data before discovering that the
lseek() system call doesn't work.  For maximum portability, you should set the
seekable argument to zero to prevent that initial tell() call when passing in
an unseekable object such as a file object created from a socket object.  If
it is 1 on entry -- which it is by default -- the tell() method of the open
file object is called once; if this raises an exception, seekable is reset to
0.  For other nonzero values of seekable, this test is not made.
41
42To get the text of a particular header there are several methods:
43
44  str = m.getheader(name)
45  str = m.getrawheader(name)
46
47where name is the name of the header, e.g. 'Subject'.  The difference is that
48getheader() strips the leading and trailing whitespace, while getrawheader()
49doesn't.  Both functions retain embedded whitespace (including newlines)
50exactly as they are specified in the header, and leave the case of the text
51unchanged.
52
53For addresses and address lists there are functions
54
55  realname, mailaddress = m.getaddr(name)
56  list = m.getaddrlist(name)
57
58where the latter returns a list of (realname, mailaddr) tuples.
59
60There is also a method
61
62  time = m.getdate(name)
63
64which parses a Date-like field and returns a time-compatible tuple,
65i.e. a tuple such as returned by time.localtime() or accepted by
66time.mktime().
67
68See the class definition for lower level access methods.
69
70There are also some utility functions here.
71"""
72# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
73
74import time
75
76from warnings import warnpy3k
77warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
78         stacklevel=2)
79
80__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]
81
_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC 2822-compliant message.

    The headers are read from the input object when the instance is created.
    Instance attributes:

    fp             -- the input object the message is read from
    seekable       -- true while fp.tell()/fp.seek() are believed to work
    startofheaders -- fp.tell() before the headers were read, or None
    startofbody    -- fp.tell() after the headers were read, or None
    headers        -- list of the raw header lines, in input order
    dict           -- maps lower-cased header names to stripped values
                      (only the *last* header of each name is kept)
    unixfrom       -- leading Unix "From " envelope line(s), or ''
    status         -- '' if the headers parsed cleanly, else an error message
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  If `seekable' is 1
        (the default), fp.tell() is called once as a probe and seekable is
        reset to 0 when that raises AttributeError or IOError.  Any other
        true value skips the probe; a false value disables seeking.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        # Remember where the headers start, if the stream can tell us.
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        # Remember where the body starts so rewindbody() can return there.
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the input object is not seekable.
        """
        if not self.seekable:
            # Call form of raise: valid in both Python 2 and Python 3.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that terminates them.
        The (normally blank) line that ends the headers is skipped, but not
        included in the returned list.  If a non-header line ends the headers,
        (which is an error), an attempt is made to backspace over it; it is
        never included in the returned list.

        The variable self.status is set to the empty string if all went well,
        otherwise it is an error message.  The variable self.headers is a
        completely uninterpreted list of lines contained in the header (so
        printing them will reproduce the header exactly as it appears in the
        file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lst = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Pick the push-back strategy: prefer fp.unread(), else tell()/seek().
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                # Remember where this line starts so it can be un-read.
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lst.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lst.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            elif headerseen is not None:
                # An empty header name. These aren't allowed in HTTP, but it's
                # probably a benign mistake. Don't add the header, just keep
                # going.
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats with special header formats.

        This implementation returns the lower-cased text before the first
        colon, or None if the line contains no colon.
        """
        i = line.find(':')
        if i > -1:
            return line[:i].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC 2822 headers.

        You may override this method if your application wants to bend the
        rules, e.g. to strip trailing whitespace, or to recognize MH template
        separators ('--------').  For convenience (e.g. for code reading from
        sockets) a line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats that support embedded comments or
        free-text data.  This implementation never skips a line.
        """
        return False

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines matching a given
        header name (and their continuation lines).  A list of the lines is
        returned, without interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple times, all
        occurrences are returned.  Case is not important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A new, different header ends the current match.
                hit = 0
            if hit:
                lst.append(line)
        return lst

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns only the
        first matching header (and its continuation lines).
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if hit:
                # Stop at the first line that is not a continuation.
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                lst.append(line)
        return lst

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the header but with the
        keyword stripped.  All leading, trailing and embedded whitespace is
        kept in the string, however.  Return None if the header does not
        occur.
        """

        lst = self.getfirstmatchingheader(name)
        if not lst:
            return None
        # Drop the header name and the colon from the first line only.
        lst[0] = lst[0][len(name) + 1:]
        return ''.join(lst)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped version of the
        header value for a given header name, or `default' (None unless
        given) if it doesn't exist.  This uses the dictionary version which
        finds the *last* such header.
        """
        return self.dict.get(name.lower(), default)
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once; each
        value in the result list is stripped in the same way as the result of
        getheader().  If the header is not given, return an empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                # Continuation line: fold it into the value being built.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                # A new occurrence of the header: flush the previous one.
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the addresses of successive headers.
                    raw.append(', ')
                # Every matching first line contains the colon that ended its
                # name, so take the text after it unconditionally; this avoids
                # reusing a stale (or unbound) value from an earlier line.
                raw.append(h[h.find(':') + 1:])
        alladdrs = ''.join(raw)
        a = AddressList(alladdrs)
        return a.addresslist

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning a tuple
        compatible with time.mktime().  Returns None if the header is
        missing.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with time.mktime(),
        and the 10th is the offset of the poster's time zone from GMT/UTC.
        Returns None if the header is missing.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary.

        Raises KeyError if the header is not present.
        """
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because any
        changed headers get stuck at the end of the raw-headers list rather
        than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        kept = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if not hit:
                kept.append(line)
        # Slice-assign so the same list object is updated in place.
        self.headers[:] = kept

    def setdefault(self, name, default=""):
        """Return the value of a header, adding it as `default' if missing."""
        lowername = name.lower()
        if lowername in self.dict:
            return self.dict[lowername]
        else:
            text = name + ": " + default
            for line in text.split("\n"):
                self.headers.append(line + "\n")
            self.dict[lowername] = default
            return default

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header names."""
        return iter(self.dict)

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined into one string."""
        return ''.join(self.headers)
468
469
470# Utility functions
471# -----------------
472
473# XXX Should fix unquote() and quote() to be really conformant.
474# XXX The inverses of the parse functions may also be useful.
475
476
def unquote(s):
    """Remove one level of enclosing quotes from a string.

    A string wrapped in double quotes has the quotes removed and the
    backslash escapes for backslash and double quote undone.  A string
    wrapped in angle brackets just has the brackets removed.  Anything else,
    including strings shorter than two characters, is returned unchanged.
    """
    if len(s) <= 1:
        return s
    first, last = s[0], s[-1]
    if first == '"' and last == '"':
        inner = s[1:-1]
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if first == '<' and last == '>':
        return s[1:-1]
    return s
485
486
def quote(s):
    """Escape a string for use inside a quoted string.

    Backslashes are doubled and double quotes are preceded by a backslash.
    The surrounding quote characters themselves are not added.
    """
    escaped = s.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
490
491
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found is returned; (None, None) is returned
    when the parse yields no address at all.
    """
    parsed = AddressList(address).addresslist
    if parsed:
        return parsed[0]
    return (None, None)
499
500
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC 2822 in front of you.

    http://www.faqs.org/rfcs/rfc2822.html

    The parser is a hand-written scanner: self.pos is the index of the next
    unread character of self.field, and every get*() method advances it.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing one or more
        addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments passed on the way are
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, as (realname,
        mailaddr) tuples.
        """
        result = []
        # Scan to the end of the field rather than stopping at the first
        # empty result, so an empty list element ("a@b,,c@d") or a stray
        # special character does not discard the addresses that follow it.
        # getaddress() always consumes at least one character when called
        # before the end of the field, so this loop terminates.
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one entry for a
        single mailbox, several for a group, none if nothing usable was
        found at the current position.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (copy) the comments seen so far.  Saving only a reference
        # would make the restore below a no-op, and comments picked up by
        # getphraselist() would be collected a second time when the text is
        # re-scanned by getaddrspec().
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' + \
                         ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip an unexpected special so the scan keeps moving.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the current character is not '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos += 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                # The domain after an '@' in the route part is discarded.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (normally the closing '>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone if no '@' follows it, otherwise
        local-part '@' domain.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.  If self is not
        looking at an instance of `beginchar' (or is already at the end of
        the field) then getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.pos >= len(self.field) or self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos += 1
        while self.pos < len(self.field):
            if quote == 1:
                # Character after a backslash is taken literally.
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met on the way are appended to
        self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
774
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    The +, -, += and -= operators treat address lists as sets.
    """
    def __init__(self, field):
        """Parse `field'; a false field gives an empty address list."""
        AddrlistClass.__init__(self, field)
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, separated by ', '."""
        return ", ".join([dump_address_pair(pair) for pair in self.addresslist])

    def __add__(self, other):
        """Set union: a new list with other's unseen addresses appended."""
        union = AddressList(None)
        mine = self.addresslist
        union.addresslist = mine + [x for x in other.addresslist
                                    if x not in mine]
        return union

    def __iadd__(self, other):
        """Set union, in-place."""
        mine = self.addresslist
        for x in other.addresslist:
            if x not in mine:
                mine.append(x)
        return self

    def __sub__(self, other):
        """Set difference: a new list without the addresses in other."""
        difference = AddressList(None)
        difference.addresslist = [x for x in self.addresslist
                                  if x not in other.addresslist]
        return difference

    def __isub__(self, other):
        """Set difference, in-place."""
        mine = self.addresslist
        for x in other.addresslist:
            if x in mine:
                mine.remove(x)
        return self

    def __getitem__(self, index):
        """Index into the address list.

        Makes indexing, slices, and 'in' work.
        """
        return self.addresslist[index]
824
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is '"name" <address>'; otherwise it is
    just the address.
    """
    name, addr = pair[0], pair[1]
    if not name:
        return addr
    return '"' + name + '" <' + addr + '>'
831
832# Parse a date field
833
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple with a timezone offset.

    Returns a 10-tuple (year, month, day, hour, minute, second, 0, 1, 0,
    tzoffset), or None if the string cannot be parsed.  tzoffset is the
    offset from UTC in seconds, or None when no usable timezone was given.
    The year is returned exactly as written (no century is added).

    Timezone names are looked up in _timezones, which holds UT/UTC/GMT/Z
    and the North American zones; other names give a tzoffset of None.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: nothing to parse.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # no space after the "weekday,"?
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The timezone may be glued onto the time ("13:32:02+0100").
        s = data[3]
        plus = s.find('+')
        minus = -1
        if plus == -1:
            minus = s.find('-')
        if plus > 0:
            data[3:] = [s[:plus], s[plus+1:]]
        elif minus > 0:
            # Keep the sign so the offset stays negative.
            data[3:] = [s[:minus], s[minus:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the day and the month are the other way round.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm)+1
    if mm > 12: mm = mm - 12        # full month names follow the short ones
    # endswith()/slicing are used below instead of [-1]/[0] indexing so that
    # an empty field leads to a None result rather than an IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The time and the year are the other way round.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # The year and the timezone are the other way round.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
938
939
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is the result of parsedate_tz() with the trailing timezone
    offset dropped; None is passed through when parsing fails.
    """
    parsed = parsedate_tz(data)
    if parsed is not None:
        return parsed[:9]
    return None
946
947
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None when the zone is
    unknown.  With an offset, the first six fields are read as a time in
    that zone and converted without consulting the local timezone.
    Without one, the fields are taken to be local time.

    Any sequence (tuple or list) is accepted.  The result is a float.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(tuple(data[:8]) + (-1,))
    # Function-scope import: nothing else here needs calendar.
    import calendar
    # timegm() reads the fields as UTC, so the local zone's offset (which
    # may have differed at that date from today's time.timezone) never
    # enters the computation.
    return float(calendar.timegm(tuple(data[:6])) - data[9])
956
def formatdate(timeval=None):
    """Return a timestamp in the format preferred for Internet standards.

    Example:  Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a number of seconds since the epoch; when it is None the
    current time is used.  The result is always expressed in GMT.

    RFC 1123 requires English day and month names, which is why the names
    come from fixed tables here instead of strftime(): strftime() honors
    the locale and could generate non-English names.
    """
    if timeval is None:
        timeval = time.time()
    year, month, day, hour, minute, second, weekday = time.gmtime(timeval)[:7]
    day_abbr = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[weekday]
    month_abbr = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        day_abbr, day, month_abbr, year, hour, minute, second)
976
977
# When used as script, run a small test program.
# The first command line argument, if given, must be a filename containing
# one message in RFC-822 format; otherwise $HOME/Mail/inbox/1 is read.
981
982if __name__ == '__main__':
983    import sys, os
984    file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
985    if sys.argv[1:]: file = sys.argv[1]
986    f = open(file, 'r')
987    m = Message(f)
988    print 'From:', m.getaddr('from')
989    print 'To:', m.getaddrlist('to')
990    print 'Subject:', m.getheader('subject')
991    print 'Date:', m.getheader('date')
992    date = m.getdate_tz('date')
993    tz = date[-1]
994    date = time.localtime(mktime_tz(date))
995    if date:
996        print 'ParsedDate:', time.asctime(date),
997        hhmmss = tz
998        hhmm, ss = divmod(hhmmss, 60)
999        hh, mm = divmod(hhmm, 60)
1000        print "%+03d%02d" % (hh, mm),
1001        if ss: print ".%02d" % ss,
1002        print
1003    else:
1004        print 'ParsedDate:', None
1005    m.rewindbody()
1006    n = 0
1007    while f.readline():
1008        n += 1
1009    print 'Lines:', n
1010    print '-'*70
1011    print 'len =', len(m)
1012    if 'Date' in m: print 'Date =', m['Date']
1013    if 'X-Nonsense' in m: pass
1014    print 'keys =', m.keys()
1015    print 'values =', m.values()
1016    print 'items =', m.items()
1017