• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (C) 2001-2010 Python Software Foundation
2# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Names exported by a star-import of this module.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'format_datetime',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'parsedate_to_datetime',
    'unquote',
    ]
24
25import os
26import re
27import time
28import datetime
29import urllib.parse
30
31from email._parseaddr import quote
32from email._parseaddr import AddressList as _AddressList
33from email._parseaddr import mktime_tz
34
35from email._parseaddr import parsedate, parsedate_tz, _parsedate_tz
36
# String constants.  COMMASPACE joins header field values before parsing,
# EMPTYSTRING joins RFC 2231 continuation segments, and TICK is the separator
# between the charset, language and text of an RFC 2231 extended value.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# Characters whose presence makes formataddr() wrap a real name in quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() backslash-escapes inside a real name.
escapesre = re.compile(r'[\\"]')
45
46
47def _has_surrogates(s):
48    """Return True if s may contain surrogate-escaped binary data."""
49    # This check is based on the fact that unless there are surrogates, utf8
50    # (Python's default encoding) can encode any string.  This is the fastest
51    # way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
52    try:
53        s.encode()
54        return False
55    except UnicodeEncodeError:
56        return True
57
58# How to deal with a string containing bytes before handing it to the
59# application through the 'normal' interface.
60def _sanitize(string):
61    # Turn any escaped bytes into unicode 'unknown' char.  If the escaped
62    # bytes happen to be utf-8 they will instead get decoded, even if they
63    # were invalid in the charset the source was supposed to be in.  This
64    # seems like it is not a bad thing; a defect was still registered.
65    original_bytes = string.encode('utf-8', 'surrogateescape')
66    return original_bytes.decode('utf-8', 'replace')
67
68
69
70# Helpers
71
def formataddr(pair, charset='utf-8'):
    """Build a header-ready address string; the inverse of parseaddr().

    pair is a 2-tuple (realname, email_address).  The result is suitable
    for an RFC 2822 From, To or Cc header.  When realname is false, the
    email address alone is returned unmodified.

    charset names the character set used to encode realname when it is not
    ASCII safe.  It may be a str or a Charset-like object providing a
    header_encode method, and defaults to 'utf-8'.

    A UnicodeError is raised when the email address is not ASCII.
    """
    name, address = pair
    # The address MUST (per RFC) be ascii; let the UnicodeError propagate.
    address.encode('ascii')
    if not name:
        return address

    try:
        name.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII real name: emit it as an encoded word.
        if isinstance(charset, str):
            # lazy import to improve module import time
            from email.charset import Charset
            charset = Charset(charset)
        encoded_name = charset.header_encode(name)
        return "%s <%s>" % (encoded_name, address)

    # ASCII real name: quote it only if it holds special characters, and
    # always backslash-escape embedded backslashes and double quotes.
    quotes = '"' if specialsre.search(name) else ''
    name = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, name, quotes, address)
105
106
107def _iter_escaped_chars(addr):
108    pos = 0
109    escape = False
110    for pos, ch in enumerate(addr):
111        if escape:
112            yield (pos, '\\' + ch)
113            escape = False
114        elif ch == '\\':
115            escape = True
116        else:
117            yield (pos, ch)
118    if escape:
119        yield (pos, '\\')
120
121
122def _strip_quoted_realnames(addr):
123    """Strip real names between quotes."""
124    if '"' not in addr:
125        # Fast path
126        return addr
127
128    start = 0
129    open_pos = None
130    result = []
131    for pos, ch in _iter_escaped_chars(addr):
132        if ch == '"':
133            if open_pos is None:
134                open_pos = pos
135            else:
136                if start != open_pos:
137                    result.append(addr[start:open_pos])
138                start = pos + 1
139                open_pos = None
140
141    if start < len(addr):
142        result.append(addr[start:])
143
144    return ''.join(result)
145
146
supports_strict_parsing = True

def getaddresses(fieldvalues, *, strict=True):
    """Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue.

    A fieldvalue that fails to parse is represented by the 2-tuple ('', '').

    If strict is true, use a strict parser which rejects malformed inputs.
    """

    # In strict mode, a parse that yields a different number of addresses
    # than the input announces is treated as a parsing error, and a list
    # holding a single empty 2-tuple [('', '')] is returned instead.  This
    # avoids handing back invalid output.
    #
    # Malformed input: getaddresses(['alice@example.com <bob@example.com>'])
    # Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')]
    # Safe output: [('', '')]

    if not strict:
        joined = COMMASPACE.join(str(v) for v in fieldvalues)
        return _AddressList(joined).addresslist

    fieldvalues = _pre_parse_validation([str(v) for v in fieldvalues])
    parsed = _AddressList(COMMASPACE.join(fieldvalues)).addresslist
    result = _post_parse_validation(parsed)

    # Expected number of addresses per field value: 1 + number of commas.
    # A comma inside a quoted Real Name is not a delimiter, so quoted
    # sections are stripped before the commas are counted.
    expected = sum(
        1 + _strip_quoted_realnames(value).count(',')
        for value in fieldvalues
    )
    if len(result) != expected:
        return [('', '')]

    return result
191
192
def _check_parenthesis(addr):
    """Return True when the parentheses in *addr* are balanced.

    Parentheses inside quoted real names and backslash-escaped parentheses
    are not counted.
    """
    # Ignore parenthesis in quoted real names.
    unquoted = _strip_quoted_realnames(addr)

    depth = 0
    for _, token in _iter_escaped_chars(unquoted):
        if token == '(':
            depth += 1
        elif token == ')':
            if depth == 0:
                # A closer with no matching opener.
                return False
            depth -= 1
    return depth == 0
206
207
208def _pre_parse_validation(email_header_fields):
209    accepted_values = []
210    for v in email_header_fields:
211        if not _check_parenthesis(v):
212            v = "('', '')"
213        accepted_values.append(v)
214
215    return accepted_values
216
217
218def _post_parse_validation(parsed_email_header_tuples):
219    accepted_values = []
220    # The parser would have parsed a correctly formatted domain-literal
221    # The existence of an [ after parsing indicates a parsing failure
222    for v in parsed_email_header_tuples:
223        if '[' in v[1]:
224            v = ('', '')
225        accepted_values.append(v)
226
227    return accepted_values
228
229
230def _format_timetuple_and_zone(timetuple, zone):
231    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
232        ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][timetuple[6]],
233        timetuple[2],
234        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
235         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][timetuple[1] - 1],
236        timetuple[0], timetuple[3], timetuple[4], timetuple[5],
237        zone)
238
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    timeval, when given, is a floating-point time value as accepted by
    gmtime() and localtime(); when omitted the current time is used.

    localtime is a flag; when True the date is expressed relative to the
    local timezone instead of UTC, properly taking daylight savings time
    into account.

    usegmt asks for the timezone to be written as the ascii string "GMT"
    rather than the numeric "+0000".  This is needed for HTTP, and is only
    honoured when localtime is False.
    """
    # Note: strftime() cannot be used because it honors the locale, while
    # RFC 2822 requires the English day and month abbreviations.
    if timeval is None:
        timeval = time.time()
    moment = datetime.datetime.fromtimestamp(timeval, datetime.timezone.utc)

    if localtime:
        # Local rendering always carries a numeric offset, never 'GMT'.
        return format_datetime(moment.astimezone(), False)
    if not usegmt:
        # A naive datetime is rendered with the '-0000' zone.
        moment = moment.replace(tzinfo=None)
    return format_datetime(moment, usegmt)
267
def format_datetime(dt, usegmt=False):
    """Turn a datetime into a date string as specified in RFC 2822.

    If usegmt is True, dt must be an aware datetime with an offset of zero.  In
    this case 'GMT' will be rendered instead of the normal +0000 required by
    RFC2822.  This is to support HTTP headers involving date stamps.

    A naive dt is rendered with the zone '-0000'; any other aware dt is
    rendered with its numeric offset.

    Raises ValueError if usegmt is true and dt is naive or its UTC offset
    is not zero.
    """
    now = dt.timetuple()
    if usegmt:
        # Judge by the effective offset, not by tzinfo identity, so that any
        # zero-offset tzinfo implementation is accepted as the docstring
        # promises.  utcoffset() is None for a naive datetime, which is
        # rejected as well.
        if dt.utcoffset() != datetime.timedelta(0):
            raise ValueError("usegmt option requires a UTC datetime")
        zone = 'GMT'
    elif dt.tzinfo is None:
        zone = '-0000'
    else:
        zone = dt.strftime("%z")
    return _format_timetuple_and_zone(now, zone)
285
286
def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <142480216486.20800.16526388040877946887@nightshade.la.mastaler.com>

    idstring, when given, is a string appended to the local part to
    strengthen the uniqueness of the message id.  domain, when given,
    supplies the portion of the message id after the '@'; otherwise the
    locally defined hostname is used.
    """
    # Lazy imports to speedup module import time
    # (no other functions in email.utils need these modules)
    import random
    import socket

    # The local part combines the time in hundredths of a second, the
    # process id and 64 random bits.
    timeval = int(time.time()*100)
    pid = os.getpid()
    randint = random.getrandbits(64)
    suffix = '' if idstring is None else '.' + idstring
    host = socket.getfqdn() if domain is None else domain
    return '<%d.%d.%d%s@%s>' % (timeval, pid, randint, suffix, host)
313
314
def parsedate_to_datetime(data):
    """Parse a date string into a datetime object.

    The string is parsed with _parsedate_tz().  When that reports no
    timezone offset, a naive datetime is returned; otherwise the result is
    an aware datetime with a fixed-offset timezone of that many seconds.

    Raises ValueError if the value cannot be parsed.
    """
    parsed = _parsedate_tz(data)
    if parsed is None:
        raise ValueError('Invalid date value or format "%s"' % str(data))
    # The last item is the offset in seconds (or None); the first six of
    # the remaining items are year, month, day, hour, minute and second.
    *fields, offset = parsed
    tzinfo = None
    if offset is not None:
        tzinfo = datetime.timezone(datetime.timedelta(seconds=offset))
    return datetime.datetime(*fields[:6], tzinfo=tzinfo)
324
325
def parseaddr(addr, *, strict=True):
    """
    Parse addr into its constituent realname and email address parts.

    Return a tuple of realname and email address, unless the parse fails, in
    which case return a 2-tuple of ('', '').

    If strict is True, use a strict parser which rejects malformed inputs.
    In strict mode a list argument is represented by its first element, and
    an empty list or any non-string value yields ('', '').
    """
    if not strict:
        addrs = _AddressList(addr).addresslist
        if not addrs:
            return ('', '')
        return addrs[0]

    if isinstance(addr, list):
        if not addr:
            # An empty list has no first element to parse; report a failed
            # parse instead of raising IndexError, as non-strict mode does.
            return ('', '')
        addr = addr[0]

    if not isinstance(addr, str):
        return ('', '')

    addr = _pre_parse_validation([addr])[0]
    addrs = _post_parse_validation(_AddressList(addr).addresslist)

    # Exactly one address is acceptable; zero means the parse failed and
    # more than one means the input was malformed.
    if not addrs or len(addrs) > 1:
        return ('', '')

    return addrs[0]
354
355
356# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A value wrapped in double quotes loses them and has its escaped
    backslashes and escaped double quotes unescaped.  A value wrapped in
    angle brackets just loses the brackets.  Anything else, including
    values shorter than two characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        return inner.replace('\\\\', '\\').replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
365
366
367
368# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    The string is split on its first two TICK separators.  When both are
    present the three resulting pieces are returned as a list; otherwise
    the tuple (None, None, s) is returned.
    """
    pieces = s.split(TICK, 2)
    if len(pieces) == 3:
        return pieces
    return None, None, s
375
376
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded, using charset (or 'ascii' when no charset
    is given) with no characters treated as safe.  If neither charset nor
    language is given, that percent-encoded text is returned on its own.
    Otherwise it is prefixed with the charset and language, each followed
    by a single quote; a missing language is written as the empty string.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    lang = '' if language is None else language
    return "%s'%s'%s" % (charset, lang, quoted)
390
391
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)

def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).

    Returns a new list.  The first element of params is copied through
    untouched.  Ordinary parameters become (name, quoted-value) pairs, in
    their original order.  Parameters written in RFC 2231 form are appended
    after them with their continuation segments joined; if any segment was
    %-encoded the value is a (charset, language, quoted-value) 3-tuple,
    otherwise it is a quoted string.  An empty params yields an empty list.
    """
    # Slice rather than index so that an empty sequence does not raise
    # IndexError.
    new_params = list(params[:1])
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    if rfc2231_params:
        for name, continuations in rfc2231_params.items():
            value = []
            extended = False
            # Sort by number.  An unnumbered segment ("name*") carries None
            # as its number, which cannot be compared with the integers of
            # numbered segments; malformed input mixing both forms would
            # make a plain sort() raise TypeError.  Ordering None before
            # every number keeps the sort total while leaving the order of
            # well-formed input unchanged.
            continuations.sort(
                key=lambda seg: (-1 if seg[0] is None else seg[0],
                                 seg[1], seg[2]))
            # And now append all values in numerical order, converting
            # %-encodings for the encoded segments.  If any of the
            # continuation names ends in a *, then the entire string, after
            # decoding segments and concatenating, must have the charset and
            # language specifiers at the beginning of the string.
            for num, s, encoded in continuations:
                if encoded:
                    # Decode as "latin-1", so the characters in s directly
                    # represent the percent-encoded octet values.
                    # collapse_rfc2231_value treats this as an octet sequence.
                    s = urllib.parse.unquote(s, encoding="latin-1")
                    extended = True
                value.append(s)
            value = quote(EMPTYSTRING.join(value))
            if extended:
                charset, language, value = decode_rfc2231(value)
                new_params.append((name, (charset, language, '"%s"' % value)))
            else:
                new_params.append((name, '"%s"' % value))
    return new_params
442
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse an RFC 2231 parameter value into a plain string.

    value is either a string or a (charset, language, text) 3-tuple.
    Anything that is not a 3-tuple is simply unquoted.  For a 3-tuple, the
    text is reinterpreted as bytes and decoded with charset, using
    fallback_charset when charset is None and errors as the decoding error
    handler.  If charset names no known codec, the unquoted text is
    returned instead.
    """
    is_triple = isinstance(value, tuple) and len(value) == 3
    if not is_triple:
        return unquote(value)
    charset, language, text = value
    if charset is None:
        # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse
        # the value, so use the fallback_charset.
        charset = fallback_charset
    # The text arrives as a unicode string but stands for an octet
    # sequence, so turn its characters straight back into bytes rather
    # than running them through a utf-8 encoder.
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)
461
462
463#
464# datetime doesn't provide a localtime function yet, so provide one.  Code
465# adapted from the patch in issue 9527.  This may not be perfect, but it is
466# better than not having it.
467#
468
def localtime(dt=None, isdst=None):
    """Return local time as an aware datetime object.

    With no arguments the current time is returned.  Otherwise *dt* should
    be a datetime instance, which is converted to the local time zone
    according to the system time zone database.  A naive *dt* (that is,
    one whose tzinfo is None) is assumed to be in local time.
    The isdst parameter is ignored; passing it only triggers a
    deprecation notice.

    """
    if isdst is not None:
        import warnings
        warnings._deprecated(
            "The 'isdst' parameter to 'localtime'",
            message='{name} is deprecated and slated for removal in Python {remove}',
            remove=(3, 14),
            )
    moment = datetime.datetime.now() if dt is None else dt
    return moment.astimezone()
489