# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py. This should eventually be rewritten.
"""

__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import time

SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
# The abbreviated names come first, then the full names; an index past 12 in
# this list is folded back onto 1..12 by _parsedate_tz().
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-element time tuple.

    The first nine elements form a tuple usable with time.mktime(); the
    tenth is the timezone offset from UTC in seconds.  Timezones are
    recognized either as one of the names in `_timezones' or as a numeric
    offset such as +0800.  An unknown, missing or -0000 timezone is reported
    as offset 0.

    Returns None if `data' is empty or cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return
    if res[9] is None:
        # Callers of the public API always get a numeric offset.
        res[9] = 0
    return tuple(res)

def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    The result is a 10-element list (not a tuple), or None when the input
    is empty or unparseable.  The last element is also None when the
    timezone is missing or not recognized.
    """
    if not data:
        return None
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The timezone may be glued onto the time field (e.g. 12:00:00+0100);
        # split it off, or append an empty placeholder if there is none.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Tolerate "Month Day" ordering by swapping the two fields.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # A full month name was matched; map it back onto 1..12.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The year and time fields are in the opposite order.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # The year and timezone fields are in the opposite order.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            # Kept as a string for consistency with the ':' branch above;
            # every field is converted with int() below.
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone name or malformed offset: leave tzoffset as None.
            pass
        if tzoffset==0 and tz.startswith('-'):
            # -0000 means "UTC, but the source timezone is unknown".
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]


def parsedate(data):
    """Convert a time string to a 9-element time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.
    Returns None if `data' cannot be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    else:
        return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    If the tenth element is None the first eight fields are interpreted as
    local time (with the DST flag left unknown); otherwise the fields are
    interpreted as UTC and the offset in seconds is subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        # tuple() lets any 10-element sequence (e.g. a list) be passed in.
        return time.mktime(tuple(data[:8]) + (-1,))
    else:
        # Delay the import, since mktime_tz is rarely used
        import calendar

        t = calendar.timegm(data)
        return t - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # Backslashes must be escaped first so the ones added for '"' survive.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos') into the header text (`field') and
    every get*() method advances that cursor past whatever it consumed.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        whitespace as a string, with CR and LF characters left out.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, address) pair.  A stretch of input that yields no address
        contributes an ('', '') pair.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, address) pairs: one pair for a single
        mailbox, several for a group, and an empty list when nothing usable
        was found.  A trailing comma separator is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot the comments seen so far.  The phrase parse below is
        # speculative; if the input turns out to be a bare addr-spec we rewind
        # and parse again, and a copy (rather than an alias of the live list)
        # is what keeps comments from being collected twice.
        oldcl = list(self.commentlist)
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned at a `<'.
        """
        if self.pos >= len(self.field) or self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # The domain following an `@' in the route is discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character after the addr-spec (normally the
                # closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string.  If no `@' follows the local part,
        only the local part is returned; if the domain is invalid, the empty
        string is returned.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace that preceded the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # Drop trailing whitespace before the terminator.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Whitespace is skipped and comments are moved to self.commentlist.
        Returns the empty string if a second `@' is encountered.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        The delimiters themselves are not included in the result, and a
        backslash is removed while the character it escapes is kept.
        """
        if self.pos >= len(self.field) or self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was an unescaped backslash.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments encountered along the way are
        appended to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, address) pairs are stored in `addresslist'.  The
    +, -, += and -= operators treat two AddressList objects as sets.
    """
    def __init__(self, field):
        """Parse `field'; a false value (e.g. None) gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __add__(self, other):
        """Return a new AddressList holding the union of self and other."""
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        """Add to self every address of other that is not already present."""
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        """Return a new AddressList with self's addresses not in other."""
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        """Remove from self every address that also appears in other."""
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        """Return the (realname, address) pair(s) at `index'."""
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]