# Copyright (C) 2001-2010 Python Software Foundation
# Author: Barry Warsaw
# Contact: email-sig@python.org

"""Miscellaneous utilities."""

__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'format_datetime',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'parsedate_to_datetime',
    'quote',
    'unquote',
    ]

import os
import re
import time
import datetime
import urllib.parse

from email._parseaddr import quote
from email._parseaddr import AddressList as _AddressList
from email._parseaddr import mktime_tz

from email._parseaddr import parsedate, parsedate_tz, _parsedate_tz

COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# Characters that force a display name to be wrapped in double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that must be backslash-escaped inside a display name.
escapesre = re.compile(r'[\\"]')


def _has_surrogates(s):
    """Return True if s may contain surrogate-escaped binary data."""
    # This check is based on the fact that unless there are surrogates, utf8
    # (Python's default encoding) can encode any string.  This is the fastest
    # way to check for surrogates, see bpo-11454 (moved to gh-55663) for
    # timings.
    try:
        s.encode()
        return False
    except UnicodeEncodeError:
        return True


# How to deal with a string containing bytes before handing it to the
# application through the 'normal' interface.
def _sanitize(string):
    """Return string with surrogate-escaped bytes re-decoded as utf-8.

    Bytes that do not form valid utf-8 are replaced by the unicode
    'unknown' character.
    """
    # Turn any escaped bytes into unicode 'unknown' char.  If the escaped
    # bytes happen to be utf-8 they will instead get decoded, even if they
    # were invalid in the charset the source was supposed to be in.  This
    # seems like it is not a bad thing; a defect was still registered.
    original_bytes = string.encode('utf-8', 'surrogateescape')
    return original_bytes.decode('utf-8', 'replace')


# Helpers

def formataddr(pair, charset='utf-8'):
    """The inverse of parseaddr(), this takes a 2-tuple of the form
    (realname, email_address) and returns the string value suitable
    for an RFC 2822 From, To or Cc header.

    If the first element of pair is false, then the second element is
    returned unmodified.

    The optional charset is the character set that is used to encode
    realname in case realname is not ASCII safe.  Can be an instance of str or
    a Charset-like object which has a header_encode method.  Default is
    'utf-8'.
    """
    name, address = pair
    # The address MUST (per RFC) be ascii, so raise a UnicodeError if it isn't.
    address.encode('ascii')
    if name:
        try:
            name.encode('ascii')
        except UnicodeEncodeError:
            if isinstance(charset, str):
                # lazy import to improve module import time
                from email.charset import Charset
                charset = Charset(charset)
            encoded_name = charset.header_encode(name)
            return "%s <%s>" % (encoded_name, address)
        else:
            quotes = ''
            if specialsre.search(name):
                quotes = '"'
            name = escapesre.sub(r'\\\g<0>', name)
            return '%s%s%s <%s>' % (quotes, name, quotes, address)
    return address


def _iter_escaped_chars(addr):
    """Yield (position, text) pairs for addr, keeping escapes together.

    A backslash and the character following it are yielded as a single
    two-character item at the position of the escaped character.  A trailing
    lone backslash is yielded on its own.
    """
    pos = 0
    escape = False
    for pos, ch in enumerate(addr):
        if escape:
            yield (pos, '\\' + ch)
            escape = False
        elif ch == '\\':
            escape = True
        else:
            yield (pos, ch)
    if escape:
        yield (pos, '\\')


def _strip_quoted_realnames(addr):
    """Strip real names between quotes."""
    if '"' not in addr:
        # Fast path
        return addr

    start = 0
    open_pos = None
    result = []
    for pos, ch in _iter_escaped_chars(addr):
        # Escaped quotes arrive as '\\"' and therefore never match here.
        if ch == '"':
            if open_pos is None:
                open_pos = pos
            else:
                if start != open_pos:
                    result.append(addr[start:open_pos])
                start = pos + 1
                open_pos = None

    if start < len(addr):
        result.append(addr[start:])

    return ''.join(result)


supports_strict_parsing = True


def getaddresses(fieldvalues, *, strict=True):
    """Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue.

    When parsing fails for a fieldvalue, a 2-tuple of ('', '') is returned in
    its place.

    If strict is true, use a strict parser which rejects malformed inputs.
    """

    # If strict is true, if the resulting list of parsed addresses is greater
    # than the number of fieldvalues in the input list, a parsing error has
    # occurred and consequently a list containing a single empty 2-tuple [('',
    # '')] is returned in its place.  This is done to avoid invalid output.
    #
    # Malformed input: getaddresses(['alice@example.com <bob@example.com>'])
    # Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')]
    # Safe output: [('', '')]

    if not strict:
        joined = COMMASPACE.join(str(v) for v in fieldvalues)
        a = _AddressList(joined)
        return a.addresslist

    fieldvalues = [str(v) for v in fieldvalues]
    fieldvalues = _pre_parse_validation(fieldvalues)
    addr = COMMASPACE.join(fieldvalues)
    a = _AddressList(addr)
    result = _post_parse_validation(a.addresslist)

    # Treat output as invalid if the number of addresses is not equal to the
    # expected number of addresses.
    n = 0
    for v in fieldvalues:
        # When a comma is used in the Real Name part it is not a delimiter.
        # So strip those out before counting the commas.
        v = _strip_quoted_realnames(v)
        # Expected number of addresses: 1 + number of commas
        n += 1 + v.count(',')
    if len(result) != n:
        return [('', '')]

    return result


def _check_parenthesis(addr):
    """Return True if the parentheses in addr are balanced.

    Parentheses inside quoted real names and backslash-escaped parentheses
    are not counted.
    """
    # Ignore parenthesis in quoted real names.
    addr = _strip_quoted_realnames(addr)

    opens = 0
    for pos, ch in _iter_escaped_chars(addr):
        if ch == '(':
            opens += 1
        elif ch == ')':
            opens -= 1
            if opens < 0:
                return False
    return (opens == 0)


def _pre_parse_validation(email_header_fields):
    """Return a copy of the field list with unbalanced entries replaced.

    Any value whose parentheses are not balanced is replaced by the
    placeholder string "('', '')" before parsing.
    """
    accepted_values = []
    for v in email_header_fields:
        if not _check_parenthesis(v):
            v = "('', '')"
        accepted_values.append(v)

    return accepted_values


def _post_parse_validation(parsed_email_header_tuples):
    """Return the parsed tuples with suspicious entries blanked out.

    Any (realname, address) tuple whose address still contains '[' is
    replaced by ('', '').
    """
    accepted_values = []
    # The parser would have parsed a correctly formatted domain-literal
    # The existence of an [ after parsing indicates a parsing failure
    for v in parsed_email_header_tuples:
        if '[' in v[1]:
            v = ('', '')
        accepted_values.append(v)

    return accepted_values


def _format_timetuple_and_zone(timetuple, zone):
    """Format a time tuple and a zone string as an RFC 2822 style date.

    Day and month names are always the English abbreviations.
    """
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][timetuple[6]],
        timetuple[2],
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][timetuple[1] - 1],
        timetuple[0], timetuple[3], timetuple[4], timetuple[5],
        zone)


def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating-point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000").  This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    dt = datetime.datetime.fromtimestamp(timeval, datetime.timezone.utc)

    if localtime:
        dt = dt.astimezone()
        usegmt = False
    elif not usegmt:
        # A naive datetime makes format_datetime() emit the '-0000' zone.
        dt = dt.replace(tzinfo=None)
    return format_datetime(dt, usegmt)


def format_datetime(dt, usegmt=False):
    """Turn a datetime into a date string as specified in RFC 2822.

    If usegmt is True, dt must be an aware datetime with an offset of zero.  In
    this case 'GMT' will be rendered instead of the normal +0000 required by
    RFC2822.  This is to support HTTP headers involving date stamps.
    """
    now = dt.timetuple()
    if usegmt:
        if dt.tzinfo is None or dt.tzinfo != datetime.timezone.utc:
            raise ValueError("usegmt option requires a UTC datetime")
        zone = 'GMT'
    elif dt.tzinfo is None:
        zone = '-0000'
    else:
        zone = dt.strftime("%z")
    return _format_timetuple_and_zone(now, zone)


def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <142480216486.20800.16526388040877946887@nightshade.la.mastaler.com>

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.  Optional domain if given provides the
    portion of the message id after the '@'.  It defaults to the locally
    defined hostname.
    """
    # Lazy imports to speedup module import time
    # (no other functions in email.utils need these modules)
    import random
    import socket

    timeval = int(time.time()*100)
    pid = os.getpid()
    randint = random.getrandbits(64)
    if idstring is None:
        idstring = ''
    else:
        idstring = '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    msgid = '<%d.%d.%d%s@%s>' % (timeval, pid, randint, idstring, domain)
    return msgid


def parsedate_to_datetime(data):
    """Parse a date string into a datetime object.

    Raise ValueError if the date cannot be parsed.  When the parsed value
    carries no timezone offset a naive datetime is returned; otherwise the
    result is an aware datetime with a fixed offset of that many seconds.
    """
    parsed_date_tz = _parsedate_tz(data)
    if parsed_date_tz is None:
        raise ValueError('Invalid date value or format "%s"' % str(data))
    *dtuple, tz = parsed_date_tz
    if tz is None:
        return datetime.datetime(*dtuple[:6])
    return datetime.datetime(*dtuple[:6],
            tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))


def parseaddr(addr, *, strict=True):
    """
    Parse addr into its constituent realname and email address parts.

    Return a tuple of realname and email address, unless the parse fails, in
    which case return a 2-tuple of ('', '').

    If strict is True, use a strict parser which rejects malformed inputs.
    """
    if not strict:
        addrs = _AddressList(addr).addresslist
        if not addrs:
            return ('', '')
        return addrs[0]

    if isinstance(addr, list):
        if not addr:
            # Nothing to parse: report a failed parse instead of raising
            # IndexError on addr[0].
            return ('', '')
        addr = addr[0]

    if not isinstance(addr, str):
        return ('', '')

    addr = _pre_parse_validation([addr])[0]
    addrs = _post_parse_validation(_AddressList(addr).addresslist)

    if not addrs or len(addrs) > 1:
        return ('', '')

    return addrs[0]


# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string."""
    if len(str) > 1:
        if str.startswith('"') and str.endswith('"'):
            return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if str.startswith('<') and str.endswith('>'):
            return str[1:-1]
    return str


# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231"""
    parts = s.split(TICK, 2)
    if len(parts) <= 2:
        return None, None, s
    return parts


def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    If neither charset nor language is given, then s is returned as-is.  If
    charset is given but not language, the string is encoded using the empty
    string for language.
    """
    s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return s
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, s)


rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)


def _continuation_sort_key(continuation):
    """Return a sort key for a (num, value, encoded) continuation tuple.

    A segment without a number (num is None) sorts before every numbered
    segment; None and int cannot be compared with each other directly.
    """
    num, value, encoded = continuation
    return (-1 if num is None else num, value, encoded)


def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    """
    new_params = [params[0]]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    if rfc2231_params:
        for name, continuations in rfc2231_params.items():
            value = []
            extended = False
            # Sort by number.  A key is used so that a parameter mixing an
            # un-numbered segment with numbered ones does not raise
            # TypeError.
            continuations.sort(key=_continuation_sort_key)
            # And now append all values in numerical order, converting
            # %-encodings for the encoded segments.  If any of the
            # continuation names ends in a *, then the entire string, after
            # decoding segments and concatenating, must have the charset and
            # language specifiers at the beginning of the string.
            for num, s, encoded in continuations:
                if encoded:
                    # Decode as "latin-1", so the characters in s directly
                    # represent the percent-encoded octet values.
                    # collapse_rfc2231_value treats this as an octet sequence.
                    s = urllib.parse.unquote(s, encoding="latin-1")
                    extended = True
                value.append(s)
            value = quote(EMPTYSTRING.join(value))
            if extended:
                charset, language, value = decode_rfc2231(value)
                new_params.append((name, (charset, language, '"%s"' % value)))
            else:
                new_params.append((name, '"%s"' % value))
    return new_params


def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse a decoded RFC 2231 parameter value into a plain string.

    If value is not a 3-tuple it is returned unquoted.  Otherwise it is
    taken as (charset, language, text): the characters of text are
    reinterpreted as raw bytes and decoded with charset, using the errors
    handler.  fallback_charset is used when charset is None.  If the charset
    is not a known codec, the unquoted text is returned instead.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    charset, language, text = value
    if charset is None:
        # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse
        # the value, so use the fallback_charset.
        charset = fallback_charset
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)


#
# datetime doesn't provide a localtime function yet, so provide one.  Code
# adapted from the patch in issue 9527.  This may not be perfect, but it is
# better than not having it.
#

def localtime(dt=None, isdst=None):
    """Return local time as an aware datetime object.

    If called without arguments, return current time.  Otherwise *dt*
    argument should be a datetime instance, and it is converted to the
    local time zone according to the system time zone database.  If *dt* is
    naive (that is, dt.tzinfo is None), it is assumed to be in local time.
    The isdst parameter is ignored.

    """
    if isdst is not None:
        import warnings
        warnings._deprecated(
            "The 'isdst' parameter to 'localtime'",
            message='{name} is deprecated and slated for removal in Python {remove}',
            remove=(3, 14),
            )
    if dt is None:
        dt = datetime.datetime.now()
    return dt.astimezone()