• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py.  This should eventually be rewritten.
7"""
8
# Public names of this module (what ``from <module> import *`` exports).
# The address-parsing classes defined further down are deliberately not
# listed here.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
15
16import time, calendar
17
# Commonly used string constants.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Tables used when parsing a date field.
#
# Month names: the twelve abbreviations come first, followed by the twelve
# full names, so (index % 12) + 1 is the month number for either spelling.
_monthnames = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# Abbreviated day-of-week names (lower case).
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# Named timezones, mapped to their offset written as a signed HHMM integer.
#
# The table does not include the military time zones defined in RFC822,
# other than Z.  According to RFC1123, the description in RFC822 gets the
# signs wrong, so we can't rely on any such time zones.  RFC1123 recommends
# that numeric timezone indicators be used instead of timezone names.
_timezones = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    # Atlantic (used in Canada)
    'AST': -400, 'ADT': -300,
    # Eastern
    'EST': -500, 'EDT': -400,
    # Central
    'CST': -600, 'CDT': -500,
    # Mountain
    'MST': -700, 'MDT': -600,
    # Pacific
    'PST': -800, 'PDT': -700,
}
43
44
def parsedate_tz(data):
    """Parse a date string into a 10-item time tuple.

    The heavy lifting is done by _parsedate_tz(); this wrapper only
    normalizes its result.  The first nine items are the date/time fields
    and the tenth is the timezone offset in seconds.  When the low-level
    parser reports the offset as None (no usable zone information), 0 is
    substituted.

    Returns None if the string could not be parsed.
    """
    fields = _parsedate_tz(data)
    if not fields:
        return None
    offset = fields[9]
    return (*fields[:9], 0 if offset is None else offset)
56
def _parsedate_tz(data):
    """Convert a date string to an extended time tuple (returned as a list).

    On success the result is the 10-item list
    ``[year, month, day, hour, minute, second, 0, 1, -1, tzoffset]``.  The
    weekday and yearday slots are fixed placeholders (0 and 1) and the DST
    flag is -1 because it is unknown.  None is returned when *data* is
    empty or cannot be parsed; malformed input never raises.

    The last element is the time zone offset in seconds.  It is None when
    no recognizable zone was given, and also when the timezone was specified
    as -0000: that spelling indicates a UTC timestamp that explicitly
    disclaims knowledge of the source timezone, as opposed to a +0000
    timestamp that indicates the source timezone really was UTC.
    """
    if not data:
        return None
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("10:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    # Splitting an RFC 850 date on '-' can leave empty fields (for example
    # "-jan-2020" or "1-jan-"); reject them instead of indexing into them.
    if not (dd and mm and yy):
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the day and month are the other way round ("Jan 1").
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names follow the twelve abbreviations in the table.
        mm -= 12
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # The "year" slot actually holds the time; swap them.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
        if not yy:
            # The year token was a lone comma.
            return None
    if not yy[0].isdigit():
        # The "year" slot actually holds the zone; swap them.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        if yy > 68:
            # The year is between 1969 and 1999 (inclusive).
            yy += 1900
        else:
            # The year is between 2000 and 2068 (inclusive).
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # "-0000": UTC time, but the source zone is explicitly unknown.
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        tzsign = -1 if tzoffset < 0 else 1
        hours, minutes = divmod(abs(tzoffset), 100)
        tzoffset = tzsign * (hours * 3600 + minutes * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
176
177
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  When
    parsedate_tz() does not produce a tuple (parse failure), its result is
    passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
185
186
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    *data* may be any 10-item sequence (tuple or list).  Item 9 is the
    timezone offset in seconds east of UTC, or None when the zone is
    unknown.

    With a known offset the first six fields are interpreted as wall-clock
    time in that zone and converted via calendar.timegm().  With an offset
    of None the fields are interpreted as local time via time.mktime(),
    letting the platform decide whether DST applies.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT.
        # time.mktime() needs a real tuple; tuple() lets lists work too.
        return time.mktime(tuple(data[:8]) + (-1,))
    return calendar.timegm(data) - data[9]
195
196
def quote(str):
    """Escape a string for use inside an RFC 2822 quoted string.

    Backslashes and double quotes are the only characters that need
    escaping there; each one is turned into a quoted pair (prefixed with a
    backslash).  The surrounding double quotes are not added.
    """
    # Backslashes must be doubled first, otherwise the backslashes inserted
    # in front of the double quotes would themselves get escaped.
    backslashes_escaped = str.replace('\\', '\\\\')
    return backslashes_escaped.replace('"', '\\"')
205
206
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (``pos``) into the header text (``field``).
    Each ``get*`` method consumes input starting at the cursor and leaves the
    cursor just past what it consumed.  Comments met along the way are
    collected in ``commentlist``.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped blanks and tabs as a string (line-break
        characters are skipped but not returned).  Comments are appended to
        self.commentlist.
        """
        skippable = self.LWS + '\n\r'
        wslist = []
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if char in skippable:
                if char not in '\n\r':
                    wslist.append(char)
                self.pos += 1
            elif char == '(':
                # getcomment() advances the cursor past the comment.
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (name, addr-spec) tuple.  A stretch of input that yields no address
        contributes an ('', '') placeholder.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, addr-spec) tuples: one entry for a single
        mailbox, several for a group, and an empty list when nothing
        address-like was found.  A trailing comma separator is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a snapshot rather than an alias: getphraselist() appends to
        # self.commentlist in place, and the addr-spec branch below rewinds
        # and re-parses the same text.  Restoring an alias would record every
        # comment seen during the phrase scan twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group: collect members up to the closing ';'
            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not positioned on a `<'.
        """
        if self.pos >= len(self.field) or self.field[self.pos] != '<':
            return None

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Obsolete source route ("@domain"): parse and discard it.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (normally the closing '>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the addr-spec as a string.  If there is no `@' only the
        local part is returned; if the domain is invalid the empty string
        is returned.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop white space recorded just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # End of the local part; drop trailing white space.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' is found in the domain.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' (or is already
        at the end of the field) then getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment's content without the delimiters, with backslash
        escapes resolved to the escaped character.
        """
        if self.pos >= len(self.field) or self.field[self.pos] != beginchar:
            return ''

        slist = []
        escaped = False     # True right after an unescaped backslash
        self.pos += 1
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if escaped:
                # Second half of a quoted pair: taken literally.
                slist.append(char)
                escaped = False
            elif char in endchars:
                self.pos += 1
                break
            elif allowcomments and char == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif char == '\\':
                escaped = True
            else:
                slist.append(char)
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases).

        Returns the atom, which is empty if the cursor is already on a
        delimiter or at the end of the field.
        """
        if atomends is None:
            atomends = self.atomends

        start = self.pos
        while (self.pos < len(self.field)
               and self.field[self.pos] not in atomends):
            self.pos += 1

        return self.field[start:self.pos]

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  White space between words is dropped, so
        joining the returned list of words with single spaces canonicalizes
        runs of white space.  Comments are appended to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
507
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, addr-spec) tuples are kept in ``addresslist``.  The
    ``+``/``-`` operators and their in-place forms treat two address lists
    like sets (union and difference) while keeping list order.
    """

    def __init__(self, field):
        """Parse `field' right away; a false `field' gives an empty list."""
        AddrlistClass.__init__(self, field)
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __add__(self, other):
        """Return a new list: our addresses plus those only `other' has."""
        # Set union
        union = AddressList(None)
        union.addresslist = self.addresslist[:]
        union.addresslist.extend(
            addr for addr in other.addresslist
            if addr not in self.addresslist)
        return union

    def __iadd__(self, other):
        """Append the addresses of `other' that we do not have yet."""
        # Set union, in-place
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        """Return a new list with our addresses that `other' lacks."""
        # Set difference
        difference = AddressList(None)
        difference.addresslist = [
            addr for addr in self.addresslist
            if addr not in other.addresslist]
        return difference

    def __isub__(self, other):
        """Remove from this list the addresses found in `other'."""
        # Set difference, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        """Return the address (or slice of addresses) at `index'."""
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
554