# Copyright (C) 2001-2007 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
]

from functools import partial

import email.base64mime
import email.quoprimime

from email import errors
from email.encoders import encode_7or8bit


# Flags for types of header encodings
QP = 1          # Quoted-Printable
BASE64 = 2      # Base64
SHORTEST = 3    # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
RFC2047_CHROME_LEN = 7

DEFAULT_CHARSET = 'us-ascii'
UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''


# Defaults
CHARSETS = {
    # input        header enc  body enc output conv
    'iso-8859-1':  (QP,        QP,      None),
    'iso-8859-2':  (QP,        QP,      None),
    'iso-8859-3':  (QP,        QP,      None),
    'iso-8859-4':  (QP,        QP,      None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':  (QP,        QP,      None),
    'iso-8859-10': (QP,        QP,      None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP,        QP,      None),
    'iso-8859-14': (QP,        QP,      None),
    'iso-8859-15': (QP,        QP,      None),
    'iso-8859-16': (QP,        QP,      None),
    'windows-1252': (QP,       QP,      None),
    'viscii':      (QP,        QP,      None),
    'us-ascii':    (None,      None,    None),
    'big5':        (BASE64,    BASE64,  None),
    'gb2312':      (BASE64,    BASE64,  None),
    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
    'iso-2022-jp': (BASE64,    None,    None),
    'koi8-r':      (BASE64,    BASE64,  None),
    'utf-8':       (SHORTEST,  BASE64,  'utf-8'),
}

# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10': 'iso-8859-16',
    'latin-10': 'iso-8859-16',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
}


# Map charsets to their Unicode codec strings.
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
}


# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Add character set properties to the global registry.

    charset is the input character set, and must be the canonical name of a
    character set.

    Optional header_enc and body_enc is either charset.QP for
    quoted-printable, charset.BASE64 for base64 encoding, charset.SHORTEST for
    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
    is only valid for header_enc.  It describes how message headers and
    message bodies in the input charset are to be encoded.  Default is no
    encoding.

    Optional output_charset is the character set that the output should be
    in.  Text is encoded to the output charset by Charset.header_encode()
    and Charset.body_encode().  The default is to output in the same
    character set as the input.

    Both input_charset and output_charset must have Unicode codec entries in
    the module's charset-to-codec mapping; use add_codec(charset, codecname)
    to add codecs the module does not know about.  See the codecs module's
    documentation for more information.

    Raises ValueError if body_enc is SHORTEST.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = (header_enc, body_enc, output_charset)


def add_alias(alias, canonical):
    """Add a character set alias.

    alias is the alias name, e.g. latin-1
    canonical is the character set's canonical name, e.g. iso-8859-1
    """
    ALIASES[alias] = canonical


def add_codec(charset, codecname):
    """Add a codec that maps characters in the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, as appropriate for the encoding argument of the
    str.encode() method.
    """
    CODEC_MAP[charset] = codecname


# Convenience function for encoding strings, taking into account
# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
def _encode(string, codec):
    """Encode string to bytes with codec.

    The pseudo-codec UNKNOWN8BIT is handled by encoding as ASCII with the
    'surrogateescape' error handler; any other codec name is passed to
    str.encode() unchanged.
    """
    if codec == UNKNOWN8BIT:
        return string.encode('ascii', 'surrogateescape')
    else:
        return string.encode(codec)


class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     charset.QP (for quoted-printable), charset.BASE64 (for
                     base64 encoding), or charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be the same as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  A str
        # argument is only checked for being pure ASCII; any other argument
        # is decoded as ASCII via str(input_charset, 'ascii').  Either way a
        # non-ASCII charset name is rejected with CharsetError.
        try:
            if isinstance(input_charset, str):
                input_charset.encode('ascii')
            else:
                input_charset = str(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower()
        # Set the input charset after filtering through the aliases
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Look up the encodings and conversion in the CHARSETS registry.
        # Charsets that are not registered get SHORTEST header encoding,
        # BASE64 body encoding and no conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            conv = self.input_charset
        # Set the attributes; callers may overwrite them afterwards.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined for a charset, guess and
        # try a Unicode codec with the same name as the charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __repr__(self):
        return self.input_charset.lower()

    def __eq__(self, other):
        # Compare on the lower-cased string form so that a Charset compares
        # equal to a plain charset name regardless of case.
        return str(self) == str(other).lower()

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns conversion function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is not None, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def header_encode(self, string):
        """Header-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        this charset's `header_encoding`.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :return: The encoded string, with RFC 2047 chrome.  If this charset
            has no header encoding, the string is returned unchanged.
        """
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
            return string
        return encoder_module.header_encode(header_bytes, codec)

    def header_encode_lines(self, string, maxlengths):
        """Header-encode a string by converting it first to bytes.

        This is similar to `header_encode()` except that the string is fit
        into maximum line lengths as given by the argument.  Unlike
        `header_encode()`, this method needs a header encoding: it uses the
        encoder module unconditionally, so `header_encoding` must be QP,
        BASE64 or SHORTEST.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :param maxlengths: Maximum line length iterator.  Each element
            returned from this iterator will provide the next maximum line
            length.  This parameter is used as an argument to built-in next()
            and should never be exhausted.  The maximum line lengths should
            not count the RFC 2047 chrome.  These line lengths are only a
            hint; the splitter does the best it can.
        :return: Lines of encoded strings, each with RFC 2047 chrome.  The
            first element is None when not even one character fits within
            the first maximum line length.
        """
        # See which encoding we should use.
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        encoder_module = self._get_encoder(header_bytes)
        encoder = partial(encoder_module.header_encode, charset=codec)
        # Calculate the number of characters that the RFC 2047 chrome will
        # contribute to each line.
        charset = self.get_output_charset()
        extra = len(charset) + RFC2047_CHROME_LEN
        # Now comes the hard part.  We must encode bytes but we can't split on
        # bytes because some character sets are variable length and each
        # encoded word must stand on its own.  So the problem is you have to
        # encode to bytes to figure out this word's length, but you must split
        # on characters.  This causes two problems: first, we don't know how
        # many octets a specific substring of unicode characters will get
        # encoded to, and second, we don't know how many ASCII characters
        # those octets will get encoded to.  Unless we try it.  Which seems
        # inefficient.  In the interest of being correct rather than fast (and
        # in the hope that there will be few encoded headers in any such
        # message), brute force it. :(
        lines = []
        current_line = []
        maxlen = next(maxlengths) - extra
        for character in string:
            current_line.append(character)
            this_line = EMPTYSTRING.join(current_line)
            length = encoder_module.header_length(_encode(this_line, charset))
            if length > maxlen:
                # This last character doesn't fit so pop it off.
                current_line.pop()
                # Does nothing fit on the first line?
                if not lines and not current_line:
                    lines.append(None)
                else:
                    joined_line = EMPTYSTRING.join(current_line)
                    header_bytes = _encode(joined_line, codec)
                    lines.append(encoder(header_bytes))
                current_line = [character]
                maxlen = next(maxlengths) - extra
        joined_line = EMPTYSTRING.join(current_line)
        header_bytes = _encode(joined_line, codec)
        lines.append(encoder(header_bytes))
        return lines

    def _get_encoder(self, header_bytes):
        """Return the module that implements this charset's header encoding.

        Returns email.base64mime or email.quoprimime, or None when
        header_encoding is not one of BASE64, QP or SHORTEST.  For SHORTEST
        the module whose header_length() for header_bytes is smaller is
        chosen; quoted-printable wins a tie.
        """
        if self.header_encoding == BASE64:
            return email.base64mime
        elif self.header_encoding == QP:
            return email.quoprimime
        elif self.header_encoding == SHORTEST:
            len64 = email.base64mime.header_length(header_bytes)
            lenqp = email.quoprimime.header_length(header_bytes)
            if len64 < lenqp:
                return email.base64mime
            else:
                return email.quoprimime
        else:
            return None

    def body_encode(self, string):
        """Body-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.  If body_encoding is None, we assume the
        output charset is a 7bit encoding, so re-encoding the decoded
        string using the ascii codec produces the correct string version
        of the content.

        An empty (falsy) string is returned unchanged.
        """
        if not string:
            return string
        # Compare the flags by value (==), as get_body_encoding() and
        # _get_encoder() do, so that all three agree on which encoding a
        # given body_encoding value selects.
        if self.body_encoding == BASE64:
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            return email.base64mime.body_encode(string)
        elif self.body_encoding == QP:
            # quoprimime.body_encode takes a string, but operates on it as if
            # it were a list of byte codes.  For a (minimal) history on why
            # this is so, see changeset 0cf700464177.  To correctly encode a
            # character set, then, we must turn it into pseudo bytes via the
            # latin1 charset, which will encode any byte as a single code point
            # between 0 and 255, which is what body_encode is expecting.
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            string = string.decode('latin1')
            return email.quoprimime.body_encode(string)
        else:
            if isinstance(string, str):
                string = string.encode(self.output_charset).decode('ascii')
            return string