# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py. This should eventually be rewritten.
"""

__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import calendar
import time

SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
# Short names occupy indices 0-11 and long names indices 12-23, so a long
# name's index minus 12 gives the same month as the short name.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are written in +/-HHMM form (e.g. -500 means five hours behind UTC);
# _parsedate_tz() converts them to seconds.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The first nine items have the layout of a time tuple (year, month, day,
    hour, minute, second, weekday, yearday, dst); the weekday, yearday and
    dst slots are always filled with 0, 1 and -1 respectively.  The tenth
    item is the timezone offset from UTC in seconds.  An unknown timezone or
    an explicit -0000 is reported as an offset of 0.

    Returns None if `data' is empty or cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return None
    if res[9] is None:
        res[9] = 0
    return tuple(res)

def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    The last element is also None when the timezone is neither a known name
    nor an integer.

    Returns a 10-item list, or None when `data' is empty, contains only
    whitespace, or is not a recognizable date.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: nothing to parse.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The timezone may be glued to the time ("12:00:00+0100"); split it.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        # An RFC 850 date such as "1-jan-" yields empty components; the
        # indexing below would fail on them.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Month Day" ordering as well as "Day Month".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The year and time fields are swapped.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # The year and timezone fields are swapped.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            # Neither HH.MM nor HH.MM.SS; without this the time fields
            # would be left unbound.
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset==0 and tz.startswith('-'):
            # "-0000": the source timezone is explicitly unknown.
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]


def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is the result of parsedate_tz() without the timezone offset.
    Returns None if `data' cannot be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    else:
        return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    else:
        t = calendar.timegm(data)
        return t - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # Backslashes must be doubled first, otherwise the backslashes added in
    # front of double quotes would be doubled as well.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos') into the header text (`field'); each
    get*() method consumes text starting at the cursor and advances it.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        spaces and tabs as a string; CR and LF are skipped but not returned.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.  Each address is a
        (name, addr-spec) pair; a position where no address could be parsed
        contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, addr-spec) pairs: several for a group,
        otherwise at most one.  A trailing comma after the address is
        consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # NOTE: this is the same list object as self.commentlist, so comments
        # appended by getphraselist() below are visible through oldcl too.
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not at a `<', and the empty string if
        the closing `>' is reached before any addr-spec.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # The domain after an `@' in the route is parsed and dropped.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it, and the empty
        string when an `@' is present but the domain is invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace recorded just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # Drop trailing whitespace recorded before the terminator.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Spaces and tabs are skipped and comments are moved to
        self.commentlist.  Returns the empty string if a second `@' is found.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters.  A backslash is dropped
        and the character after it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Whitespace between words is discarded and
        comments are moved to self.commentlist.  Returns the list of words.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, addr-spec) pairs are kept in self.addresslist.  The
    +, -, += and -= operators treat two address lists as sets.
    """
    def __init__(self, field):
        """Parse `field'; a false value gives an empty address list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]