# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import calendar
import time

SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
# Abbreviated names come first, full names second; an index above 12 is
# folded back onto 1..12 by _parsedate_tz().
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are in +-HHMM form (e.g. -500 means five hours behind UTC).
_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    The first nine items follow the time.struct_time layout; the tenth is
    the timezone offset from UTC in seconds.  Named zones are looked up in
    the module's timezone table, which deliberately omits the RFC 822
    military zones other than Z.  An unknown or missing zone, as well as
    an explicit -0000, is reported as offset 0.

    Returns None when the string cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return
    if res[9] is None:
        res[9] = 0
    return tuple(res)


def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    The last element is also None when the zone is missing or is neither a
    known name nor an integer.  Returns a 10-item list, or None when the
    input cannot be parsed.
    """
    if not data:
        return None
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("10:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Month Day" order as well as "Day Month".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    # The fields below may be empty strings (for instance after splitting a
    # malformed RFC 850 date such as "jan--1"), so use endswith() rather
    # than indexing, which would raise IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are swapped in the input.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        # Nothing left to interpret as a year (e.g. the field was just ",").
        return None
    if not yy[0].isdigit():
        # Year and zone are swapped in the input.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]


def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is the result of parsedate_tz() without the trailing timezone
    offset; None is returned when the string cannot be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    else:
        return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    When the offset (item 9) is None the fields are interpreted as local
    time; otherwise they are interpreted as UTC and the offset is
    subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        # tuple() lets any sequence (e.g. a list) be passed, not just a tuple.
        return time.mktime(tuple(data[:8]) + (-1,))
    else:
        t = calendar.timegm(data)
        return t - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # NOTE: the parameter name shadows the builtin `str`; it is kept because
    # callers may pass it by keyword.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string; CR and LF are
        skipped but not included.  Comments are appended to
        self.commentlist.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.  A position where
        no address could be parsed contributes an ('', '') entry.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) tuples; a group yields one entry
        per member, and the list is empty when nothing could be parsed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over the unexpected special so parsing makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the current character is not '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Consume (and discard) the domain following an '@'.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string: just the local part when no '@'
        follows it, or the empty string when the domain is invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace collected just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string when a second '@' is encountered.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash, so the current
        # character is taken literally.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments found along the way are
        appended to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist


class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses."""

    def __init__(self, field):
        """Parse `field' into self.addresslist (empty for a false field)."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]