1"""Header value parser implementing various email-related RFC parsing rules. 2 3The parsing methods defined in this module implement various email related 4parsing rules. Principal among them is RFC 5322, which is the followon 5to RFC 2822 and primarily a clarification of the former. It also implements 6RFC 2047 encoded word decoding. 7 8RFC 5322 goes to considerable trouble to maintain backward compatibility with 9RFC 822 in the parse phase, while cleaning up the structure on the generation 10phase. This parser supports correct RFC 5322 generation by tagging white space 11as folding white space only when folding is allowed in the non-obsolete rule 12sets. Actually, the parser is even more generous when accepting input than RFC 135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages. 14Where possible deviations from the standard are annotated on the 'defects' 15attribute of tokens that deviate. 16 17The general structure of the parser follows RFC 5322, and uses its terminology 18where there is a direct correspondence. Where the implementation requires a 19somewhat different structure than that used by the formal grammar, new terms 20that mimic the closest existing terms are used. Thus, it really helps to have 21a copy of RFC 5322 handy when studying this code. 22 23Input to the parser is a string that has already been unfolded according to 24RFC 5322 rules. According to the RFC this unfolding is the very first step, and 25this parser leaves the unfolding step to a higher level message parser, which 26will have already detected the line breaks that need unfolding while 27determining the beginning and end of each header. 28 29The output of the parser is a TokenList object, which is a list subclass. A 30TokenList is a recursive data structure. The terminal nodes of the structure 31are Terminal objects, which are subclasses of str. These do not correspond 32directly to terminal objects in the formal grammar, but are instead more 33practical higher level combinations of true terminals. 34 35All TokenList and Terminal objects have a 'value' attribute, which produces the 36semantically meaningful value of that part of the parse subtree. The value of 37all whitespace tokens (no matter how many sub-tokens they may contain) is a 38single space, as per the RFC rules. This includes 'CFWS', which is herein 39included in the general class of whitespace tokens. There is one exception to 40the rule that whitespace tokens are collapsed into single spaces in values: in 41the value of a 'bare-quoted-string' (a quoted-string with no leading or 42trailing whitespace), any whitespace that appeared between the quotation marks 43is preserved in the returned value. Note that in all Terminal strings quoted 44pairs are turned into their unquoted values. 45 46All TokenList and Terminal objects also have a string value, which attempts to 47be a "canonical" representation of the RFC-compliant form of the substring that 48produced the parsed subtree, including minimal use of quoted pair quoting. 49Whitespace runs are not collapsed. 50 51Comment tokens also have a 'content' attribute providing the string found 52between the parens (including any nested comments) with whitespace preserved. 53 54All TokenList and Terminal objects have a 'defects' attribute which is a 55possibly empty list all of the defects found while creating the token. Defects 56may appear on any token in the tree, and a composite list of all defects in the 57subtree is available through the 'all_defects' attribute of any node. 
(For Terminal objects, x.defects == x.all_defects.)

Each object in a parse tree is called a 'token', and each has a 'token_type'
attribute that gives the name from the RFC 5322 grammar that it represents.
Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
It is returned in place of lists of (ctext/quoted-pair) and
(qtext/quoted-pair).

XXX: provide complete list of token types.
"""

import re
import sys
import urllib   # For urllib.parse.unquote
from string import hexdigits
from operator import itemgetter
from email import _encoded_words as _ew
from email import errors
from email import utils

#
# Useful constants and functions
#

WSP = set(' \t')
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
NLSET = {'\n', '\r'}
SPECIALSNL = SPECIALS | NLSET

def quote_string(value):
    return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'

# Match an RFC 2047 word; it looks like =?utf-8?q?someword?=
rfc2047_matcher = re.compile(r'''
   =\?            # literal =?
   [^?]*          # charset
   \?             # literal ?
   [qQbB]         # literal 'q' or 'b', case insensitive
   \?             # literal ?
   .*?            # encoded word
   \?=            # literal ?=
''', re.VERBOSE | re.MULTILINE)
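
# Editor's illustrative sketch (not part of the original module): how the two
# helpers above behave.  quote_string() wraps a value in RFC 5322
# quoted-string quoting, and rfc2047_matcher recognizes an RFC 2047 encoded
# word embedded in a run of text.  The _example_* name is hypothetical and is
# never called by the parser.
def _example_quoting_helpers():
    # Backslashes and double quotes are escaped, then the whole value is
    # wrapped in DQUOTEs.
    assert quote_string(r'a "b" c\d') == r'"a \"b\" c\\d"'
    # An encoded word can be found even in the middle of an atom-like run.
    assert rfc2047_matcher.search('foo=?utf-8?q?bar?=baz') is not None
    assert rfc2047_matcher.search('not an encoded word') is None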

#
# TokenList and its subclasses
#

class TokenList(list):

    token_type = None
    syntactic_break = True
    ew_combine_allowed = True

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        self.defects = []

    def __str__(self):
        return ''.join(str(x) for x in self)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                               super().__repr__())

    @property
    def value(self):
        return ''.join(x.value for x in self if x.value)

    @property
    def all_defects(self):
        return sum((x.all_defects for x in self), self.defects)

    def startswith_fws(self):
        return self[0].startswith_fws()

    @property
    def as_ew_allowed(self):
        """True if all top level tokens of this part may be RFC2047 encoded."""
        return all(part.as_ew_allowed for part in self)

    @property
    def comments(self):
        comments = []
        for token in self:
            comments.extend(token.comments)
        return comments

    def fold(self, *, policy):
        return _refold_parse_tree(self, policy=policy)

    def pprint(self, indent=''):
        print(self.ppstr(indent=indent))

    def ppstr(self, indent=''):
        return '\n'.join(self._pp(indent=indent))

    def _pp(self, indent=''):
        yield '{}{}/{}('.format(
            indent,
            self.__class__.__name__,
            self.token_type)
        for token in self:
            if not hasattr(token, '_pp'):
                yield (indent + '    !! invalid element in token '
                                'list: {!r}'.format(token))
            else:
                yield from token._pp(indent+'    ')
        if self.defects:
            extra = ' Defects: {}'.format(self.defects)
        else:
            extra = ''
        yield '{}){}'.format(indent, extra)


class WhiteSpaceTokenList(TokenList):

    @property
    def value(self):
        return ' '

    @property
    def comments(self):
        return [x.content for x in self if x.token_type=='comment']


class UnstructuredTokenList(TokenList):
    token_type = 'unstructured'


class Phrase(TokenList):
    token_type = 'phrase'

class Word(TokenList):
    token_type = 'word'


class CFWSList(WhiteSpaceTokenList):
    token_type = 'cfws'


class Atom(TokenList):
    token_type = 'atom'


class Token(TokenList):
    token_type = 'token'
    encode_as_ew = False


class EncodedWord(TokenList):
    token_type = 'encoded-word'
    cte = None
    charset = None
    lang = None


class QuotedString(TokenList):

    token_type = 'quoted-string'

    @property
    def content(self):
        for x in self:
            if x.token_type == 'bare-quoted-string':
                return x.value

    @property
    def quoted_value(self):
        res = []
        for x in self:
            if x.token_type == 'bare-quoted-string':
                res.append(str(x))
            else:
                res.append(x.value)
        return ''.join(res)

    @property
    def stripped_value(self):
        for token in self:
            if token.token_type == 'bare-quoted-string':
                return token.value


class BareQuotedString(QuotedString):

    token_type = 'bare-quoted-string'

    def __str__(self):
        return quote_string(''.join(str(x) for x in self))

    @property
    def value(self):
        return ''.join(str(x) for x in self)


class Comment(WhiteSpaceTokenList):

    token_type = 'comment'

    def __str__(self):
        return ''.join(sum([
                            ["("],
                            [self.quote(x) for x in self],
                            [")"],
                            ], []))

    def quote(self, value):
        if value.token_type == 'comment':
            return str(value)
        return str(value).replace('\\', '\\\\').replace(
                                  '(', r'\(').replace(
                                  ')', r'\)')

    @property
    def content(self):
        return ''.join(str(x) for x in self)

    @property
    def comments(self):
        return [self.content]

class AddressList(TokenList):

    token_type = 'address-list'

    @property
    def addresses(self):
        return [x for x in self if x.token_type=='address']

    @property
    def mailboxes(self):
        return sum((x.mailboxes
                    for x in self if x.token_type=='address'), [])

    @property
    def all_mailboxes(self):
        return sum((x.all_mailboxes
                    for x in self if x.token_type=='address'), [])


class Address(TokenList):

    token_type = 'address'

    @property
    def display_name(self):
        if self[0].token_type == 'group':
            return self[0].display_name

    @property
    def mailboxes(self):
        if self[0].token_type == 'mailbox':
            return [self[0]]
        elif self[0].token_type == 'invalid-mailbox':
            return []
        return self[0].mailboxes

    @property
    def all_mailboxes(self):
        if self[0].token_type == 'mailbox':
            return [self[0]]
        elif self[0].token_type == 'invalid-mailbox':
            return [self[0]]
        return self[0].all_mailboxes

class MailboxList(TokenList):

    token_type = 'mailbox-list'

    @property
    def mailboxes(self):
        return [x for x in self if x.token_type=='mailbox']

    @property
    def all_mailboxes(self):
        return [x for x in self
                if x.token_type in ('mailbox', 'invalid-mailbox')]


class GroupList(TokenList):

    token_type = 'group-list'

    @property
    def mailboxes(self):
        if not self or self[0].token_type != 'mailbox-list':
            return []
        return self[0].mailboxes

    @property
    def all_mailboxes(self):
        if not self or self[0].token_type != 'mailbox-list':
            return []
        return self[0].all_mailboxes


class Group(TokenList):

    token_type = "group"

    @property
    def mailboxes(self):
        if self[2].token_type != 'group-list':
            return []
        return self[2].mailboxes

    @property
    def all_mailboxes(self):
        if self[2].token_type != 'group-list':
            return []
        return self[2].all_mailboxes

    @property
    def display_name(self):
        return self[0].display_name


class NameAddr(TokenList):

    token_type = 'name-addr'

    @property
    def display_name(self):
        if len(self) == 1:
            return None
        return self[0].display_name

    @property
    def local_part(self):
        return self[-1].local_part

    @property
    def domain(self):
        return self[-1].domain

    @property
    def route(self):
        return self[-1].route

    @property
    def addr_spec(self):
        return self[-1].addr_spec


class AngleAddr(TokenList):

    token_type = 'angle-addr'

    @property
    def local_part(self):
        for x in self:
            if x.token_type == 'addr-spec':
                return x.local_part

    @property
    def domain(self):
        for x in self:
            if x.token_type == 'addr-spec':
                return x.domain

    @property
    def route(self):
        for x in self:
            if x.token_type == 'obs-route':
                return x.domains

    @property
    def addr_spec(self):
        for x in self:
            if x.token_type == 'addr-spec':
                if x.local_part:
                    return x.addr_spec
                else:
                    return quote_string(x.local_part) + x.addr_spec
        else:
            return '<>'


class ObsRoute(TokenList):

    token_type = 'obs-route'

    @property
    def domains(self):
        return [x.domain for x in self if x.token_type == 'domain']


class Mailbox(TokenList):

    token_type = 'mailbox'

    @property
    def display_name(self):
        if self[0].token_type == 'name-addr':
            return self[0].display_name

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        return self[0].domain

    @property
    def route(self):
        if self[0].token_type == 'name-addr':
            return self[0].route

    @property
    def addr_spec(self):
        return self[0].addr_spec


class InvalidMailbox(TokenList):

    token_type = 'invalid-mailbox'

    @property
    def display_name(self):
        return None

    local_part = domain = route = addr_spec = display_name


class Domain(TokenList):

    token_type = 'domain'
    as_ew_allowed = False

    @property
    def domain(self):
        return ''.join(super().value.split())


class DotAtom(TokenList):
    token_type = 'dot-atom'


class DotAtomText(TokenList):
    token_type = 'dot-atom-text'
    as_ew_allowed = True


class NoFoldLiteral(TokenList):
    token_type = 'no-fold-literal'
    as_ew_allowed = False
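
# Editor's illustrative sketch (not part of the original module): the address
# token classes above expose convenience attributes once a parse tree has been
# built.  The _example_* name is hypothetical; get_mailbox is defined further
# down in this module, which is fine because the name is only resolved when
# the function is called.
def _example_mailbox_attributes():
    mailbox, rest = get_mailbox('Fred Bloggs <fred@example.com>')
    assert rest == ''
    assert mailbox.token_type == 'mailbox'
    assert mailbox.display_name == 'Fred Bloggs'
    assert mailbox.local_part == 'fred'
    assert mailbox.domain == 'example.com'
    assert mailbox.addr_spec == 'fred@example.com'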

class AddrSpec(TokenList):

    token_type = 'addr-spec'
    as_ew_allowed = False

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        if len(self) < 3:
            return None
        return self[-1].domain

    @property
    def value(self):
        if len(self) < 3:
            return self[0].value
        return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()

    @property
    def addr_spec(self):
        nameset = set(self.local_part)
        if len(nameset) > len(nameset-DOT_ATOM_ENDS):
            lp = quote_string(self.local_part)
        else:
            lp = self.local_part
        if self.domain is not None:
            return lp + '@' + self.domain
        return lp


class ObsLocalPart(TokenList):

    token_type = 'obs-local-part'
    as_ew_allowed = False


class DisplayName(Phrase):

    token_type = 'display-name'
    ew_combine_allowed = False

    @property
    def display_name(self):
        res = TokenList(self)
        if len(res) == 0:
            return res.value
        if res[0].token_type == 'cfws':
            res.pop(0)
        else:
            if (isinstance(res[0], TokenList) and
                    res[0][0].token_type == 'cfws'):
                res[0] = TokenList(res[0][1:])
        if res[-1].token_type == 'cfws':
            res.pop()
        else:
            if (isinstance(res[-1], TokenList) and
                    res[-1][-1].token_type == 'cfws'):
                res[-1] = TokenList(res[-1][:-1])
        return res.value

    @property
    def value(self):
        quote = False
        if self.defects:
            quote = True
        else:
            for x in self:
                if x.token_type == 'quoted-string':
                    quote = True
        if len(self) != 0 and quote:
            pre = post = ''
            if (self[0].token_type == 'cfws' or
                    isinstance(self[0], TokenList) and
                    self[0][0].token_type == 'cfws'):
                pre = ' '
            if (self[-1].token_type == 'cfws' or
                    isinstance(self[-1], TokenList) and
                    self[-1][-1].token_type == 'cfws'):
                post = ' '
            return pre+quote_string(self.display_name)+post
        else:
            return super().value


class LocalPart(TokenList):

    token_type = 'local-part'
    as_ew_allowed = False

    @property
    def value(self):
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        # Strip whitespace from front, back, and around dots.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            if tok.token_type == 'cfws':
                continue
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        res = TokenList(res[1:-1])
        return res.value


class DomainLiteral(TokenList):

    token_type = 'domain-literal'
    as_ew_allowed = False

    @property
    def domain(self):
        return ''.join(super().value.split())

    @property
    def ip(self):
        for x in self:
            if x.token_type == 'ptext':
                return x.value


class MIMEVersion(TokenList):

    token_type = 'mime-version'
    major = None
    minor = None

class Parameter(TokenList):

    token_type = 'parameter'
    sectioned = False
    extended = False
    charset = 'us-ascii'

    @property
    def section_number(self):
        # Because the first token, the attribute (name), eats CFWS, the second
        # token is always the section if there is one.
        return self[1].number if self.sectioned else 0

    @property
    def param_value(self):
        # This is part of the "handle quoted extended parameters" hack.
        for token in self:
            if token.token_type == 'value':
                return token.stripped_value
            if token.token_type == 'quoted-string':
                for token in token:
                    if token.token_type == 'bare-quoted-string':
                        for token in token:
                            if token.token_type == 'value':
                                return token.stripped_value
        return ''


class InvalidParameter(Parameter):

    token_type = 'invalid-parameter'


class Attribute(TokenList):

    token_type = 'attribute'

    @property
    def stripped_value(self):
        for token in self:
            if token.token_type.endswith('attrtext'):
                return token.value

class Section(TokenList):

    token_type = 'section'
    number = None


class Value(TokenList):

    token_type = 'value'

    @property
    def stripped_value(self):
        token = self[0]
        if token.token_type == 'cfws':
            token = self[1]
        if token.token_type.endswith(
                ('quoted-string', 'attribute', 'extended-attribute')):
            return token.stripped_value
        return self.value

class MimeParameters(TokenList):

    token_type = 'mime-parameters'
    syntactic_break = False

    @property
    def params(self):
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = {}  # Using order preserving dict from Python 3.7+
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except (LookupError, UnicodeEncodeError):
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''


class ParameterizedHeaderValue(TokenList):

    # Set this false so that the value doesn't wind up on a new line even
    # if it and the parameters would fit there but not on the first line.
    syntactic_break = False

    @property
    def params(self):
        for token in reversed(self):
            if token.token_type == 'mime-parameters':
                return token.params
        return {}


class ContentType(ParameterizedHeaderValue):
    token_type = 'content-type'
    as_ew_allowed = False
    maintype = 'text'
    subtype = 'plain'


class ContentDisposition(ParameterizedHeaderValue):
    token_type = 'content-disposition'
    as_ew_allowed = False
    content_disposition = None


class ContentTransferEncoding(TokenList):
    token_type = 'content-transfer-encoding'
    as_ew_allowed = False
    cte = '7bit'


class HeaderLabel(TokenList):
    token_type = 'header-label'
    as_ew_allowed = False


class MsgID(TokenList):
    token_type = 'msg-id'
    as_ew_allowed = False

    def fold(self, policy):
        # message-id tokens may not be folded.
        return str(self) + policy.linesep


class MessageID(MsgID):
    token_type = 'message-id'


class InvalidMessageID(MessageID):
    token_type = 'invalid-message-id'


class Header(TokenList):
    token_type = 'header'
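
# Editor's illustrative sketch (not part of the original module): how the
# params machinery above is typically consumed.  It assumes this module's
# parse_content_type_header() helper, which is defined later in the file
# (outside this excerpt); the _example_* name is hypothetical and the function
# is never called by the parser.
def _example_content_type_params():
    ct = parse_content_type_header('text/plain; charset="utf-8"; format=flowed')
    assert ct.token_type == 'content-type'
    assert (ct.maintype, ct.subtype) == ('text', 'plain')
    # ParameterizedHeaderValue.params yields (name, value) pairs in first-seen
    # order, with quoted-string values already unquoted.
    assert dict(ct.params) == {'charset': 'utf-8', 'format': 'flowed'}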

#
# Terminal classes and instances
#

class Terminal(str):

    as_ew_allowed = True
    ew_combine_allowed = True
    syntactic_break = True

    def __new__(cls, value, token_type):
        self = super().__new__(cls, value)
        self.token_type = token_type
        self.defects = []
        return self

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, super().__repr__())

    def pprint(self):
        print(self.__class__.__name__ + '/' + self.token_type)

    @property
    def all_defects(self):
        return list(self.defects)

    def _pp(self, indent=''):
        return ["{}{}/{}({}){}".format(
            indent,
            self.__class__.__name__,
            self.token_type,
            super().__repr__(),
            '' if not self.defects else ' {}'.format(self.defects),
            )]

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def __getnewargs__(self):
        return (str(self), self.token_type)


class WhiteSpaceTerminal(Terminal):

    @property
    def value(self):
        return ' '

    def startswith_fws(self):
        return True


class ValueTerminal(Terminal):

    @property
    def value(self):
        return self

    def startswith_fws(self):
        return False


class EWWhiteSpaceTerminal(WhiteSpaceTerminal):

    @property
    def value(self):
        return ''

    def __str__(self):
        return ''


class _InvalidEwError(errors.HeaderParseError):
    """Invalid encoded word found while parsing headers."""


# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
ListSeparator.as_ew_allowed = False
ListSeparator.syntactic_break = False
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')

#
# Parser
#

# Parse strings according to RFC822/2047/2822/5322 rules.
#
# This is a stateless parser.  Each get_XXX function accepts a string and
# returns either a Terminal or a TokenList representing the RFC object named
# by the method and a string containing the remaining unparsed characters
# from the input.  Thus a parser method consumes the next syntactic construct
# of a given type and returns a token representing the construct plus the
# unparsed remainder of the input string.
#
# For example, if the first element of a structured header is a 'phrase',
# then:
#
#     phrase, value = get_phrase(value)
#
# returns the complete phrase from the start of the string value, plus any
# characters left in the string after the phrase is removed.

_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATOM_ENDS)))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(TOKEN_ENDS)))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATTRIBUTE_ENDS)))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match

def _validate_xtext(xtext):
    """If input token contains ASCII non-printables, register a defect."""

    non_printables = _non_printable_finder(xtext)
    if non_printables:
        xtext.defects.append(errors.NonPrintableDefect(non_printables))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))
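
# Editor's illustrative sketch (not part of the original module): the get_XXX
# convention described in the comment block above.  Each parser returns the
# parsed token plus the unparsed remainder of its input.  The _example_* name
# is hypothetical; the get_* functions referenced here are defined below.
def _example_get_convention():
    token, rest = get_atom('  abc def')
    assert token.token_type == 'atom'
    # Whitespace runs collapse to a single space in the semantic value.
    assert token.value == ' abc '
    # The remainder starts right after the consumed construct.
    assert rest == 'def'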

def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext (which may contain quoted-pairs) into a single string by unquoting
    any quoted pairs.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted pairs decoded.

    """
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False
    had_qp = False
    for pos in range(len(fragment)):
        if fragment[pos] == '\\':
            if escape:
                escape = False
                had_qp = True
            else:
                escape = True
                continue
        if escape:
            escape = False
        elif fragment[pos] in endchars:
            break
        vchars.append(fragment[pos])
    else:
        pos = pos + 1
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp

def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens where
    folding can be done, but when we are parsing the *un*folding has already
    been done so we don't need to watch out for CRLF.

    """
    newvalue = value.lstrip()
    fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
    return fws, newvalue

def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if (len(remstr) > 1 and
        remstr[0] in hexdigits and
        remstr[1] in hexdigits and
        tok.count('?') < 2):
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except (ValueError, KeyError):
        raise _InvalidEwError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    # Encoded words should be followed by whitespace.
    if value and value[0] not in WSP:
        ew.defects.append(errors.InvalidHeaderDefect(
            "missing trailing whitespace after encoded-word"))
    return ew, value

def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / VCHAR

    obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        valid_ew = True
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except _InvalidEwError:
                valid_ew = False
            except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
            else:
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        tok, *remainder = _wsp_splitter(value, 1)
        # Split in the middle of an atom if there is a rfc2047 encoded word
        # which does not have WSP on both sides.  The defect will be registered
        # the next time through the loop.
        # This needs to only be performed when the encoded word is valid;
        # otherwise, performing it on an invalid encoded word can cause
        # the parser to go in an infinite loop.
        if valid_ew and rfc2047_matcher.search(tok):
            tok, *remainder = value.partition('=?')
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured

def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext, since we are handling nested comments in the
    comment parser and unquoting quoted-pairs here.  We allow anything except
    the '()' characters, but if we find any ASCII other than the RFC defined
    printable ASCII, a NonPrintableDefect is added to the token's defects
    list.  Since quoted pairs are converted to their unquoted values, what is
    returned is a 'ptext' token.  In this case it is a WhiteSpaceTerminal, so
    its value is ' '.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '()')
    ptext = WhiteSpaceTerminal(ptext, 'ptext')
    _validate_xtext(ptext)
    return ptext, value

def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to their
    unquoted values, so what is returned is a 'ptext' token.  In this case it
    is a ValueTerminal.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '"')
    ptext = ValueTerminal(ptext, 'ptext')
    _validate_xtext(ptext)
    return ptext, value
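
# Editor's illustrative sketch (not part of the original module):
# get_unstructured decodes embedded RFC 2047 encoded words and collapses
# whitespace in the semantic value.  The _example_* name is hypothetical and
# the function is never called by the parser.
def _example_get_unstructured():
    tl = get_unstructured('Hello =?utf-8?q?w=C3=B6rld?= !')
    assert tl.token_type == 'unstructured'
    # The encoded word is decoded in the semantic value.
    assert tl.value == 'Hello wörld !'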
1201 """ 1202 m = _non_atom_end_matcher(value) 1203 if not m: 1204 raise errors.HeaderParseError( 1205 "expected atext but found '{}'".format(value)) 1206 atext = m.group() 1207 value = value[len(atext):] 1208 atext = ValueTerminal(atext, 'atext') 1209 _validate_xtext(atext) 1210 return atext, value 1211 1212def get_bare_quoted_string(value): 1213 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE 1214 1215 A quoted-string without the leading or trailing white space. Its 1216 value is the text between the quote marks, with whitespace 1217 preserved and quoted pairs decoded. 1218 """ 1219 if not value or value[0] != '"': 1220 raise errors.HeaderParseError( 1221 "expected '\"' but found '{}'".format(value)) 1222 bare_quoted_string = BareQuotedString() 1223 value = value[1:] 1224 if value and value[0] == '"': 1225 token, value = get_qcontent(value) 1226 bare_quoted_string.append(token) 1227 while value and value[0] != '"': 1228 if value[0] in WSP: 1229 token, value = get_fws(value) 1230 elif value[:2] == '=?': 1231 valid_ew = False 1232 try: 1233 token, value = get_encoded_word(value) 1234 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 1235 "encoded word inside quoted string")) 1236 valid_ew = True 1237 except errors.HeaderParseError: 1238 token, value = get_qcontent(value) 1239 # Collapse the whitespace between two encoded words that occur in a 1240 # bare-quoted-string. 1241 if valid_ew and len(bare_quoted_string) > 1: 1242 if (bare_quoted_string[-1].token_type == 'fws' and 1243 bare_quoted_string[-2].token_type == 'encoded-word'): 1244 bare_quoted_string[-1] = EWWhiteSpaceTerminal( 1245 bare_quoted_string[-1], 'fws') 1246 else: 1247 token, value = get_qcontent(value) 1248 bare_quoted_string.append(token) 1249 if not value: 1250 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 1251 "end of header inside quoted string")) 1252 return bare_quoted_string, value 1253 return bare_quoted_string, value[1:] 1254 1255def get_comment(value): 1256 """comment = "(" *([FWS] ccontent) [FWS] ")" 1257 ccontent = ctext / quoted-pair / comment 1258 1259 We handle nested comments here, and quoted-pair in our qp-ctext routine. 1260 """ 1261 if value and value[0] != '(': 1262 raise errors.HeaderParseError( 1263 "expected '(' but found '{}'".format(value)) 1264 comment = Comment() 1265 value = value[1:] 1266 while value and value[0] != ")": 1267 if value[0] in WSP: 1268 token, value = get_fws(value) 1269 elif value[0] == '(': 1270 token, value = get_comment(value) 1271 else: 1272 token, value = get_qp_ctext(value) 1273 comment.append(token) 1274 if not value: 1275 comment.defects.append(errors.InvalidHeaderDefect( 1276 "end of header inside comment")) 1277 return comment, value 1278 return comment, value[1:] 1279 1280def get_cfws(value): 1281 """CFWS = (1*([FWS] comment) [FWS]) / FWS 1282 1283 """ 1284 cfws = CFWSList() 1285 while value and value[0] in CFWS_LEADER: 1286 if value[0] in WSP: 1287 token, value = get_fws(value) 1288 else: 1289 token, value = get_comment(value) 1290 cfws.append(token) 1291 return cfws, value 1292 1293def get_quoted_string(value): 1294 """quoted-string = [CFWS] <bare-quoted-string> [CFWS] 1295 1296 'bare-quoted-string' is an intermediate class defined by this 1297 parser and not by the RFC grammar. It is the quoted string 1298 without any attached CFWS. 
1299 """ 1300 quoted_string = QuotedString() 1301 if value and value[0] in CFWS_LEADER: 1302 token, value = get_cfws(value) 1303 quoted_string.append(token) 1304 token, value = get_bare_quoted_string(value) 1305 quoted_string.append(token) 1306 if value and value[0] in CFWS_LEADER: 1307 token, value = get_cfws(value) 1308 quoted_string.append(token) 1309 return quoted_string, value 1310 1311def get_atom(value): 1312 """atom = [CFWS] 1*atext [CFWS] 1313 1314 An atom could be an rfc2047 encoded word. 1315 """ 1316 atom = Atom() 1317 if value and value[0] in CFWS_LEADER: 1318 token, value = get_cfws(value) 1319 atom.append(token) 1320 if value and value[0] in ATOM_ENDS: 1321 raise errors.HeaderParseError( 1322 "expected atom but found '{}'".format(value)) 1323 if value.startswith('=?'): 1324 try: 1325 token, value = get_encoded_word(value) 1326 except errors.HeaderParseError: 1327 # XXX: need to figure out how to register defects when 1328 # appropriate here. 1329 token, value = get_atext(value) 1330 else: 1331 token, value = get_atext(value) 1332 atom.append(token) 1333 if value and value[0] in CFWS_LEADER: 1334 token, value = get_cfws(value) 1335 atom.append(token) 1336 return atom, value 1337 1338def get_dot_atom_text(value): 1339 """ dot-text = 1*atext *("." 1*atext) 1340 1341 """ 1342 dot_atom_text = DotAtomText() 1343 if not value or value[0] in ATOM_ENDS: 1344 raise errors.HeaderParseError("expected atom at a start of " 1345 "dot-atom-text but found '{}'".format(value)) 1346 while value and value[0] not in ATOM_ENDS: 1347 token, value = get_atext(value) 1348 dot_atom_text.append(token) 1349 if value and value[0] == '.': 1350 dot_atom_text.append(DOT) 1351 value = value[1:] 1352 if dot_atom_text[-1] is DOT: 1353 raise errors.HeaderParseError("expected atom at end of dot-atom-text " 1354 "but found '{}'".format('.'+value)) 1355 return dot_atom_text, value 1356 1357def get_dot_atom(value): 1358 """ dot-atom = [CFWS] dot-atom-text [CFWS] 1359 1360 Any place we can have a dot atom, we could instead have an rfc2047 encoded 1361 word. 1362 """ 1363 dot_atom = DotAtom() 1364 if value[0] in CFWS_LEADER: 1365 token, value = get_cfws(value) 1366 dot_atom.append(token) 1367 if value.startswith('=?'): 1368 try: 1369 token, value = get_encoded_word(value) 1370 except errors.HeaderParseError: 1371 # XXX: need to figure out how to register defects when 1372 # appropriate here. 1373 token, value = get_dot_atom_text(value) 1374 else: 1375 token, value = get_dot_atom_text(value) 1376 dot_atom.append(token) 1377 if value and value[0] in CFWS_LEADER: 1378 token, value = get_cfws(value) 1379 dot_atom.append(token) 1380 return dot_atom, value 1381 1382def get_word(value): 1383 """word = atom / quoted-string 1384 1385 Either atom or quoted-string may start with CFWS. We have to peel off this 1386 CFWS first to determine which type of word to parse. Afterward we splice 1387 the leading CFWS, if any, into the parsed sub-token. 1388 1389 If neither an atom or a quoted-string is found before the next special, a 1390 HeaderParseError is raised. 1391 1392 The token returned is either an Atom or a QuotedString, as appropriate. 1393 This means the 'word' level of the formal grammar is not represented in the 1394 parse tree; this is because having that extra layer when manipulating the 1395 parse tree is more confusing than it is helpful. 

def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off this
    CFWS first to determine which type of word to parse.  Afterward we splice
    the leading CFWS, if any, into the parsed sub-token.

    If neither an atom nor a quoted-string is found before the next special, a
    HeaderParseError is raised.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in the
    parse tree; this is because having that extra layer when manipulating the
    parse tree is more confusing than it is helpful.

    """
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    else:
        leader = None
    if not value:
        raise errors.HeaderParseError(
            "Expected 'atom' or 'quoted-string' but found nothing.")
    if value[0]=='"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    if leader is not None:
        token[:0] = [leader]
    return token, value

def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    This means a phrase can be a sequence of words, periods, and CFWS in any
    order as long as it starts with at least one word.  If anything other than
    words is detected, an ObsoleteHeaderDefect is added to the token's defect
    list.  We also accept a phrase that starts with CFWS followed by a dot;
    this is registered as an InvalidHeaderDefect, since it is not supported by
    even the obsolete grammar.

    """
    phrase = Phrase()
    try:
        token, value = get_word(value)
        phrase.append(token)
    except errors.HeaderParseError:
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    while value and value[0] not in PHRASE_ENDS:
        if value[0]=='.':
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
        else:
            try:
                token, value = get_word(value)
            except errors.HeaderParseError:
                if value[0] in CFWS_LEADER:
                    token, value = get_cfws(value)
                    phrase.defects.append(errors.ObsoleteHeaderDefect(
                        "comment found without atom"))
                else:
                    raise
            phrase.append(token)
    return phrase, value

def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    """
    local_part = LocalPart()
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters"))
    return local_part, value
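
# Editor's illustrative sketch (not part of the original module): a
# quoted-string local part keeps its internal whitespace in the semantic
# value, while the string form re-adds the quoting.  The _example_* name is
# hypothetical and never called by the parser.
def _example_quoted_local_part():
    local_part, rest = get_local_part('"john q. public"@example.com')
    assert rest == '@example.com'
    assert local_part.local_part == 'john q. public'
    assert local_part.value == '"john q. public"'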

def get_obs_local_part(value):
    """ obs-local-part = word *("." word)
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    if not obs_local_part:
        raise errors.HeaderParseError(
            "expected obs-local-part but found '{}'".format(value))
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type=='cfws' and
            len(obs_local_part) > 1 and
            obs_local_part[1].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type=='cfws' and
            len(obs_local_part) > 1 and
            obs_local_part[-2].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value

def get_dtext(value):
    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    We allow anything except the excluded characters, but if we find any
    ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Quoted pairs are converted to their
    unquoted values, so what is returned is a ptext token, in this case a
    ValueTerminal.  If there were quoted pairs, an ObsoleteHeaderDefect is
    added to the returned token's defect list.

    """
    ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
    ptext = ValueTerminal(ptext, 'ptext')
    if had_qp:
        ptext.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(ptext)
    return ptext, value

def _check_for_early_dl_end(value, domain_literal):
    if value:
        return False
    domain_literal.append(errors.InvalidHeaderDefect(
        "end of input inside domain-literal"))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    return True

def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    """
    domain_literal = DomainLiteral()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
                "but found '{}'".format(value))
    value = value[1:]
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    token, value = get_dtext(value)
    domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
                "but found '{}'".format(value))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    return domain_literal, value

def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom)

    """
    domain = Domain()
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    if value[0] == '[':
        token, value = get_domain_literal(value)
        if leader is not None:
            token[:0] = [leader]
        domain.append(token)
        return domain, value
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        token, value = get_atom(value)
    if value and value[0] == '@':
        raise errors.HeaderParseError('Invalid Domain')
    if leader is not None:
        token[:0] = [leader]
    domain.append(token)
    if value and value[0] == '.':
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            token, value = get_atom(value[1:])
            domain.append(token)
    return domain, value

def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    """
    addr_spec = AddrSpec()
    token, value = get_local_part(value)
    addr_spec.append(token)
    if not value or value[0] != '@':
        addr_spec.defects.append(errors.InvalidHeaderDefect(
            "addr-spec local part with no domain"))
        return addr_spec, value
    addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    token, value = get_domain(value[1:])
    addr_spec.append(token)
    return addr_spec, value
1672 """ 1673 obs_route = ObsRoute() 1674 while value and (value[0]==',' or value[0] in CFWS_LEADER): 1675 if value[0] in CFWS_LEADER: 1676 token, value = get_cfws(value) 1677 obs_route.append(token) 1678 elif value[0] == ',': 1679 obs_route.append(ListSeparator) 1680 value = value[1:] 1681 if not value or value[0] != '@': 1682 raise errors.HeaderParseError( 1683 "expected obs-route domain but found '{}'".format(value)) 1684 obs_route.append(RouteComponentMarker) 1685 token, value = get_domain(value[1:]) 1686 obs_route.append(token) 1687 while value and value[0]==',': 1688 obs_route.append(ListSeparator) 1689 value = value[1:] 1690 if not value: 1691 break 1692 if value[0] in CFWS_LEADER: 1693 token, value = get_cfws(value) 1694 obs_route.append(token) 1695 if not value: 1696 break 1697 if value[0] == '@': 1698 obs_route.append(RouteComponentMarker) 1699 token, value = get_domain(value[1:]) 1700 obs_route.append(token) 1701 if not value: 1702 raise errors.HeaderParseError("end of header while parsing obs-route") 1703 if value[0] != ':': 1704 raise errors.HeaderParseError( "expected ':' marking end of " 1705 "obs-route but found '{}'".format(value)) 1706 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker')) 1707 return obs_route, value[1:] 1708 1709def get_angle_addr(value): 1710 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr 1711 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] 1712 1713 """ 1714 angle_addr = AngleAddr() 1715 if value and value[0] in CFWS_LEADER: 1716 token, value = get_cfws(value) 1717 angle_addr.append(token) 1718 if not value or value[0] != '<': 1719 raise errors.HeaderParseError( 1720 "expected angle-addr but found '{}'".format(value)) 1721 angle_addr.append(ValueTerminal('<', 'angle-addr-start')) 1722 value = value[1:] 1723 # Although it is not legal per RFC5322, SMTP uses '<>' in certain 1724 # circumstances. 1725 if value and value[0] == '>': 1726 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 1727 angle_addr.defects.append(errors.InvalidHeaderDefect( 1728 "null addr-spec in angle-addr")) 1729 value = value[1:] 1730 return angle_addr, value 1731 try: 1732 token, value = get_addr_spec(value) 1733 except errors.HeaderParseError: 1734 try: 1735 token, value = get_obs_route(value) 1736 angle_addr.defects.append(errors.ObsoleteHeaderDefect( 1737 "obsolete route specification in angle-addr")) 1738 except errors.HeaderParseError: 1739 raise errors.HeaderParseError( 1740 "expected addr-spec or obs-route but found '{}'".format(value)) 1741 angle_addr.append(token) 1742 token, value = get_addr_spec(value) 1743 angle_addr.append(token) 1744 if value and value[0] == '>': 1745 value = value[1:] 1746 else: 1747 angle_addr.defects.append(errors.InvalidHeaderDefect( 1748 "missing trailing '>' on angle-addr")) 1749 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 1750 if value and value[0] in CFWS_LEADER: 1751 token, value = get_cfws(value) 1752 angle_addr.append(token) 1753 return angle_addr, value 1754 1755def get_display_name(value): 1756 """ display-name = phrase 1757 1758 Because this is simply a name-rule, we don't return a display-name 1759 token containing a phrase, but rather a display-name token with 1760 the content of the phrase. 

def get_display_name(value):
    """ display-name = phrase

    Because this is simply a name-rule, we don't return a display-name
    token containing a phrase, but rather a display-name token with
    the content of the phrase.

    """
    display_name = DisplayName()
    token, value = get_phrase(value)
    display_name.extend(token[:])
    display_name.defects = token.defects[:]
    return display_name, value


def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    """
    name_addr = NameAddr()
    # Both the optional display name and the angle-addr can start with cfws.
    leader = None
    if not value:
        raise errors.HeaderParseError(
            "expected name-addr but found '{}'".format(value))
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        token, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(token))
        if leader is not None:
            if isinstance(token[0], TokenList):
                token[0][:0] = [leader]
            else:
                token[:0] = [leader]
            leader = None
        name_addr.append(token)
    token, value = get_angle_addr(value)
    if leader is not None:
        token[:0] = [leader]
    name_addr.append(token)
    return name_addr, value

def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    """
    # The only way to figure out if we are dealing with a name-addr or an
    # addr-spec is to try parsing each one.
    mailbox = Mailbox()
    try:
        token, value = get_name_addr(value)
    except errors.HeaderParseError:
        try:
            token, value = get_addr_spec(value)
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected mailbox but found '{}'".format(value))
    if any(isinstance(x, errors.InvalidHeaderDefect)
           for x in token.all_defects):
        mailbox.token_type = 'invalid-mailbox'
    mailbox.append(token)
    return mailbox, value

def get_invalid_mailbox(value, endchars):
    """ Read everything up to one of the chars in endchars.

    This is outside the formal grammar.  The InvalidMailbox TokenList that is
    returned acts like a Mailbox, but the data attributes are None.

    """
    invalid_mailbox = InvalidMailbox()
    while value and value[0] not in endchars:
        if value[0] in PHRASE_ENDS:
            invalid_mailbox.append(ValueTerminal(value[0],
                                                 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_mailbox.append(token)
    return invalid_mailbox, value
1855 1856 """ 1857 mailbox_list = MailboxList() 1858 while value and value[0] != ';': 1859 try: 1860 token, value = get_mailbox(value) 1861 mailbox_list.append(token) 1862 except errors.HeaderParseError: 1863 leader = None 1864 if value[0] in CFWS_LEADER: 1865 leader, value = get_cfws(value) 1866 if not value or value[0] in ',;': 1867 mailbox_list.append(leader) 1868 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 1869 "empty element in mailbox-list")) 1870 else: 1871 token, value = get_invalid_mailbox(value, ',;') 1872 if leader is not None: 1873 token[:0] = [leader] 1874 mailbox_list.append(token) 1875 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1876 "invalid mailbox in mailbox-list")) 1877 elif value[0] == ',': 1878 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 1879 "empty element in mailbox-list")) 1880 else: 1881 token, value = get_invalid_mailbox(value, ',;') 1882 if leader is not None: 1883 token[:0] = [leader] 1884 mailbox_list.append(token) 1885 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1886 "invalid mailbox in mailbox-list")) 1887 if value and value[0] not in ',;': 1888 # Crap after mailbox; treat it as an invalid mailbox. 1889 # The mailbox info will still be available. 1890 mailbox = mailbox_list[-1] 1891 mailbox.token_type = 'invalid-mailbox' 1892 token, value = get_invalid_mailbox(value, ',;') 1893 mailbox.extend(token) 1894 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1895 "invalid mailbox in mailbox-list")) 1896 if value and value[0] == ',': 1897 mailbox_list.append(ListSeparator) 1898 value = value[1:] 1899 return mailbox_list, value 1900 1901 1902def get_group_list(value): 1903 """ group-list = mailbox-list / CFWS / obs-group-list 1904 obs-group-list = 1*([CFWS] ",") [CFWS] 1905 1906 """ 1907 group_list = GroupList() 1908 if not value: 1909 group_list.defects.append(errors.InvalidHeaderDefect( 1910 "end of header before group-list")) 1911 return group_list, value 1912 leader = None 1913 if value and value[0] in CFWS_LEADER: 1914 leader, value = get_cfws(value) 1915 if not value: 1916 # This should never happen in email parsing, since CFWS-only is a 1917 # legal alternative to group-list in a group, which is the only 1918 # place group-list appears. 
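            # Editor's illustration (input invented for this note): given a
            # truncated header such as
            #     A Group: (some comment)
            # the leader above swallows " (some comment)", nothing remains,
            # and we land here to record that the header ended inside the
            # group-list.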
1919 group_list.defects.append(errors.InvalidHeaderDefect( 1920 "end of header in group-list")) 1921 group_list.append(leader) 1922 return group_list, value 1923 if value[0] == ';': 1924 group_list.append(leader) 1925 return group_list, value 1926 token, value = get_mailbox_list(value) 1927 if len(token.all_mailboxes)==0: 1928 if leader is not None: 1929 group_list.append(leader) 1930 group_list.extend(token) 1931 group_list.defects.append(errors.ObsoleteHeaderDefect( 1932 "group-list with empty entries")) 1933 return group_list, value 1934 if leader is not None: 1935 token[:0] = [leader] 1936 group_list.append(token) 1937 return group_list, value 1938 1939def get_group(value): 1940 """ group = display-name ":" [group-list] ";" [CFWS] 1941 1942 """ 1943 group = Group() 1944 token, value = get_display_name(value) 1945 if not value or value[0] != ':': 1946 raise errors.HeaderParseError("expected ':' at end of group " 1947 "display name but found '{}'".format(value)) 1948 group.append(token) 1949 group.append(ValueTerminal(':', 'group-display-name-terminator')) 1950 value = value[1:] 1951 if value and value[0] == ';': 1952 group.append(ValueTerminal(';', 'group-terminator')) 1953 return group, value[1:] 1954 token, value = get_group_list(value) 1955 group.append(token) 1956 if not value: 1957 group.defects.append(errors.InvalidHeaderDefect( 1958 "end of header in group")) 1959 elif value[0] != ';': 1960 raise errors.HeaderParseError( 1961 "expected ';' at end of group but found {}".format(value)) 1962 group.append(ValueTerminal(';', 'group-terminator')) 1963 value = value[1:] 1964 if value and value[0] in CFWS_LEADER: 1965 token, value = get_cfws(value) 1966 group.append(token) 1967 return group, value 1968 1969def get_address(value): 1970 """ address = mailbox / group 1971 1972 Note that counter-intuitively, an address can be either a single address or 1973 a list of addresses (a group). This is why the returned Address object has 1974 a 'mailboxes' attribute which treats a single address as a list of length 1975 one. When you need to differentiate between to two cases, extract the single 1976 element, which is either a mailbox or a group token. 1977 1978 """ 1979 # The formal grammar isn't very helpful when parsing an address. mailbox 1980 # and group, especially when allowing for obsolete forms, start off very 1981 # similarly. It is only when you reach one of @, <, or : that you know 1982 # what you've got. So, we try each one in turn, starting with the more 1983 # likely of the two. We could perhaps make this more efficient by looking 1984 # for a phrase and then branching based on the next character, but that 1985 # would be a premature optimization. 1986 address = Address() 1987 try: 1988 token, value = get_group(value) 1989 except errors.HeaderParseError: 1990 try: 1991 token, value = get_mailbox(value) 1992 except errors.HeaderParseError: 1993 raise errors.HeaderParseError( 1994 "expected address but found '{}'".format(value)) 1995 address.append(token) 1996 return address, value 1997 1998def get_address_list(value): 1999 """ address_list = (address *("," address)) / obs-addr-list 2000 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]) 2001 2002 We depart from the formal grammar here by continuing to parse until the end 2003 of the input, assuming the input to be entirely composed of an 2004 address-list. This is always true in email parsing, and allows us 2005 to skip invalid addresses to parse additional valid ones. 
2006 2007 """ 2008 address_list = AddressList() 2009 while value: 2010 try: 2011 token, value = get_address(value) 2012 address_list.append(token) 2013 except errors.HeaderParseError: 2014 leader = None 2015 if value[0] in CFWS_LEADER: 2016 leader, value = get_cfws(value) 2017 if not value or value[0] == ',': 2018 address_list.append(leader) 2019 address_list.defects.append(errors.ObsoleteHeaderDefect( 2020 "address-list entry with no content")) 2021 else: 2022 token, value = get_invalid_mailbox(value, ',') 2023 if leader is not None: 2024 token[:0] = [leader] 2025 address_list.append(Address([token])) 2026 address_list.defects.append(errors.InvalidHeaderDefect( 2027 "invalid address in address-list")) 2028 elif value[0] == ',': 2029 address_list.defects.append(errors.ObsoleteHeaderDefect( 2030 "empty element in address-list")) 2031 else: 2032 token, value = get_invalid_mailbox(value, ',') 2033 if leader is not None: 2034 token[:0] = [leader] 2035 address_list.append(Address([token])) 2036 address_list.defects.append(errors.InvalidHeaderDefect( 2037 "invalid address in address-list")) 2038 if value and value[0] != ',': 2039 # Crap after address; treat it as an invalid mailbox. 2040 # The mailbox info will still be available. 2041 mailbox = address_list[-1][0] 2042 mailbox.token_type = 'invalid-mailbox' 2043 token, value = get_invalid_mailbox(value, ',') 2044 mailbox.extend(token) 2045 address_list.defects.append(errors.InvalidHeaderDefect( 2046 "invalid address in address-list")) 2047 if value: # Must be a , at this point. 2048 address_list.append(ListSeparator) 2049 value = value[1:] 2050 return address_list, value 2051 2052 2053def get_no_fold_literal(value): 2054 """ no-fold-literal = "[" *dtext "]" 2055 """ 2056 no_fold_literal = NoFoldLiteral() 2057 if not value: 2058 raise errors.HeaderParseError( 2059 "expected no-fold-literal but found '{}'".format(value)) 2060 if value[0] != '[': 2061 raise errors.HeaderParseError( 2062 "expected '[' at the start of no-fold-literal " 2063 "but found '{}'".format(value)) 2064 no_fold_literal.append(ValueTerminal('[', 'no-fold-literal-start')) 2065 value = value[1:] 2066 token, value = get_dtext(value) 2067 no_fold_literal.append(token) 2068 if not value or value[0] != ']': 2069 raise errors.HeaderParseError( 2070 "expected ']' at the end of no-fold-literal " 2071 "but found '{}'".format(value)) 2072 no_fold_literal.append(ValueTerminal(']', 'no-fold-literal-end')) 2073 return no_fold_literal, value[1:] 2074 2075def get_msg_id(value): 2076 """msg-id = [CFWS] "<" id-left '@' id-right ">" [CFWS] 2077 id-left = dot-atom-text / obs-id-left 2078 id-right = dot-atom-text / no-fold-literal / obs-id-right 2079 no-fold-literal = "[" *dtext "]" 2080 """ 2081 msg_id = MsgID() 2082 if value and value[0] in CFWS_LEADER: 2083 token, value = get_cfws(value) 2084 msg_id.append(token) 2085 if not value or value[0] != '<': 2086 raise errors.HeaderParseError( 2087 "expected msg-id but found '{}'".format(value)) 2088 msg_id.append(ValueTerminal('<', 'msg-id-start')) 2089 value = value[1:] 2090 # Parse id-left. 2091 try: 2092 token, value = get_dot_atom_text(value) 2093 except errors.HeaderParseError: 2094 try: 2095 # obs-id-left is same as local-part of add-spec. 
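            # Editor's illustration (value invented): an id-left such as the
            # quoted string in <"two words"@example.com> fails the
            # dot-atom-text parse above but is accepted here through the
            # local-part rules, and the msg-id is tagged with an
            # ObsoleteHeaderDefect just below.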
            token, value = get_obs_local_part(value)
            msg_id.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete id-left in msg-id"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected dot-atom-text or obs-id-left"
                " but found '{}'".format(value))
    msg_id.append(token)
    if not value or value[0] != '@':
        msg_id.defects.append(errors.InvalidHeaderDefect(
            "msg-id with no id-right"))
        # Even though there is no id-right, if the local part
        # ends with `>` let's just parse it too and return
        # along with the defect.
        if value and value[0] == '>':
            msg_id.append(ValueTerminal('>', 'msg-id-end'))
            value = value[1:]
        return msg_id, value
    msg_id.append(ValueTerminal('@', 'address-at-symbol'))
    value = value[1:]
    # Parse id-right.
    try:
        token, value = get_dot_atom_text(value)
    except errors.HeaderParseError:
        try:
            token, value = get_no_fold_literal(value)
        except errors.HeaderParseError:
            try:
                token, value = get_domain(value)
                msg_id.defects.append(errors.ObsoleteHeaderDefect(
                    "obsolete id-right in msg-id"))
            except errors.HeaderParseError:
                raise errors.HeaderParseError(
                    "expected dot-atom-text, no-fold-literal or obs-id-right"
                    " but found '{}'".format(value))
    msg_id.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        msg_id.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on msg-id"))
    msg_id.append(ValueTerminal('>', 'msg-id-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        msg_id.append(token)
    return msg_id, value


def parse_message_id(value):
    """message-id = "Message-ID:" msg-id CRLF
    """
    message_id = MessageID()
    try:
        token, value = get_msg_id(value)
        message_id.append(token)
    except errors.HeaderParseError as ex:
        token = get_unstructured(value)
        message_id = InvalidMessageID(token)
        message_id.defects.append(
            errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(ex)))
    else:
        # Value after parsing a valid msg_id should be empty.
        if value:
            message_id.defects.append(errors.InvalidHeaderDefect(
                "Unexpected {!r}".format(value)))

    return message_id

#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two levels of parser routines: the get_XXX methods that get a token in
# the grammar, and parse_XXX methods that parse an entire field value. So
# get_address_list above should really be a parse_ method, as probably should
# be get_unstructured.
#

def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    """
    # The [CFWS] is implicit in the RFC 2045 BNF.
    # XXX: This routine is a bit verbose, should factor out a get_int method.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Expected MIME version number but found only CFWS"))
    digits = ''
    while value and value[0] != '.'
and value[0] not in CFWS_LEADER: 2191 digits += value[0] 2192 value = value[1:] 2193 if not digits.isdigit(): 2194 mime_version.defects.append(errors.InvalidHeaderDefect( 2195 "Expected MIME major version number but found {!r}".format(digits))) 2196 mime_version.append(ValueTerminal(digits, 'xtext')) 2197 else: 2198 mime_version.major = int(digits) 2199 mime_version.append(ValueTerminal(digits, 'digits')) 2200 if value and value[0] in CFWS_LEADER: 2201 token, value = get_cfws(value) 2202 mime_version.append(token) 2203 if not value or value[0] != '.': 2204 if mime_version.major is not None: 2205 mime_version.defects.append(errors.InvalidHeaderDefect( 2206 "Incomplete MIME version; found only major number")) 2207 if value: 2208 mime_version.append(ValueTerminal(value, 'xtext')) 2209 return mime_version 2210 mime_version.append(ValueTerminal('.', 'version-separator')) 2211 value = value[1:] 2212 if value and value[0] in CFWS_LEADER: 2213 token, value = get_cfws(value) 2214 mime_version.append(token) 2215 if not value: 2216 if mime_version.major is not None: 2217 mime_version.defects.append(errors.InvalidHeaderDefect( 2218 "Incomplete MIME version; found only major number")) 2219 return mime_version 2220 digits = '' 2221 while value and value[0] not in CFWS_LEADER: 2222 digits += value[0] 2223 value = value[1:] 2224 if not digits.isdigit(): 2225 mime_version.defects.append(errors.InvalidHeaderDefect( 2226 "Expected MIME minor version number but found {!r}".format(digits))) 2227 mime_version.append(ValueTerminal(digits, 'xtext')) 2228 else: 2229 mime_version.minor = int(digits) 2230 mime_version.append(ValueTerminal(digits, 'digits')) 2231 if value and value[0] in CFWS_LEADER: 2232 token, value = get_cfws(value) 2233 mime_version.append(token) 2234 if value: 2235 mime_version.defects.append(errors.InvalidHeaderDefect( 2236 "Excess non-CFWS text after MIME version")) 2237 mime_version.append(ValueTerminal(value, 'xtext')) 2238 return mime_version 2239 2240def get_invalid_parameter(value): 2241 """ Read everything up to the next ';'. 2242 2243 This is outside the formal grammar. The InvalidParameter TokenList that is 2244 returned acts like a Parameter, but the data attributes are None. 2245 2246 """ 2247 invalid_parameter = InvalidParameter() 2248 while value and value[0] != ';': 2249 if value[0] in PHRASE_ENDS: 2250 invalid_parameter.append(ValueTerminal(value[0], 2251 'misplaced-special')) 2252 value = value[1:] 2253 else: 2254 token, value = get_phrase(value) 2255 invalid_parameter.append(token) 2256 return invalid_parameter, value 2257 2258def get_ttext(value): 2259 """ttext = <matches _ttext_matcher> 2260 2261 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's 2262 defects list if we find non-ttext characters. We also register defects for 2263 *any* non-printables even though the RFC doesn't exclude all of them, 2264 because we follow the spirit of RFC 5322. 2265 2266 """ 2267 m = _non_token_end_matcher(value) 2268 if not m: 2269 raise errors.HeaderParseError( 2270 "expected ttext but found '{}'".format(value)) 2271 ttext = m.group() 2272 value = value[len(ttext):] 2273 ttext = ValueTerminal(ttext, 'ttext') 2274 _validate_xtext(ttext) 2275 return ttext, value 2276 2277def get_token(value): 2278 """token = [CFWS] 1*ttext [CFWS] 2279 2280 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or 2281 tspecials. We also exclude tabs even though the RFC doesn't. 2282 2283 The RFC implies the CFWS but is not explicit about it in the BNF. 
2284 2285 """ 2286 mtoken = Token() 2287 if value and value[0] in CFWS_LEADER: 2288 token, value = get_cfws(value) 2289 mtoken.append(token) 2290 if value and value[0] in TOKEN_ENDS: 2291 raise errors.HeaderParseError( 2292 "expected token but found '{}'".format(value)) 2293 token, value = get_ttext(value) 2294 mtoken.append(token) 2295 if value and value[0] in CFWS_LEADER: 2296 token, value = get_cfws(value) 2297 mtoken.append(token) 2298 return mtoken, value 2299 2300def get_attrtext(value): 2301 """attrtext = 1*(any non-ATTRIBUTE_ENDS character) 2302 2303 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the 2304 token's defects list if we find non-attrtext characters. We also register 2305 defects for *any* non-printables even though the RFC doesn't exclude all of 2306 them, because we follow the spirit of RFC 5322. 2307 2308 """ 2309 m = _non_attribute_end_matcher(value) 2310 if not m: 2311 raise errors.HeaderParseError( 2312 "expected attrtext but found {!r}".format(value)) 2313 attrtext = m.group() 2314 value = value[len(attrtext):] 2315 attrtext = ValueTerminal(attrtext, 'attrtext') 2316 _validate_xtext(attrtext) 2317 return attrtext, value 2318 2319def get_attribute(value): 2320 """ [CFWS] 1*attrtext [CFWS] 2321 2322 This version of the BNF makes the CFWS explicit, and as usual we use a 2323 value terminal for the actual run of characters. The RFC equivalent of 2324 attrtext is the token characters, with the subtraction of '*', "'", and '%'. 2325 We include tab in the excluded set just as we do for token. 2326 2327 """ 2328 attribute = Attribute() 2329 if value and value[0] in CFWS_LEADER: 2330 token, value = get_cfws(value) 2331 attribute.append(token) 2332 if value and value[0] in ATTRIBUTE_ENDS: 2333 raise errors.HeaderParseError( 2334 "expected token but found '{}'".format(value)) 2335 token, value = get_attrtext(value) 2336 attribute.append(token) 2337 if value and value[0] in CFWS_LEADER: 2338 token, value = get_cfws(value) 2339 attribute.append(token) 2340 return attribute, value 2341 2342def get_extended_attrtext(value): 2343 """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%') 2344 2345 This is a special parsing routine so that we get a value that 2346 includes % escapes as a single string (which we decode as a single 2347 string later). 2348 2349 """ 2350 m = _non_extended_attribute_end_matcher(value) 2351 if not m: 2352 raise errors.HeaderParseError( 2353 "expected extended attrtext but found {!r}".format(value)) 2354 attrtext = m.group() 2355 value = value[len(attrtext):] 2356 attrtext = ValueTerminal(attrtext, 'extended-attrtext') 2357 _validate_xtext(attrtext) 2358 return attrtext, value 2359 2360def get_extended_attribute(value): 2361 """ [CFWS] 1*extended_attrtext [CFWS] 2362 2363 This is like the non-extended version except we allow % characters, so that 2364 we can pick up an encoded value as a single string. 2365 2366 """ 2367 # XXX: should we have an ExtendedAttribute TokenList? 
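    # Editor's sketch (value invented): in an RFC 2231 parameter value such as
    #     us-ascii'en'This%20is%20%2A%2A%2Afun%2A%2A%2A
    # the run after the second "'" must come back as a single token so its
    # %XX escapes can be decoded in one pass later.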
2368 attribute = Attribute() 2369 if value and value[0] in CFWS_LEADER: 2370 token, value = get_cfws(value) 2371 attribute.append(token) 2372 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS: 2373 raise errors.HeaderParseError( 2374 "expected token but found '{}'".format(value)) 2375 token, value = get_extended_attrtext(value) 2376 attribute.append(token) 2377 if value and value[0] in CFWS_LEADER: 2378 token, value = get_cfws(value) 2379 attribute.append(token) 2380 return attribute, value 2381 2382def get_section(value): 2383 """ '*' digits 2384 2385 The formal BNF is more complicated because leading 0s are not allowed. We 2386 check for that and add a defect. We also assume no CFWS is allowed between 2387 the '*' and the digits, though the RFC is not crystal clear on that. 2388 The caller should already have dealt with leading CFWS. 2389 2390 """ 2391 section = Section() 2392 if not value or value[0] != '*': 2393 raise errors.HeaderParseError("Expected section but found {}".format( 2394 value)) 2395 section.append(ValueTerminal('*', 'section-marker')) 2396 value = value[1:] 2397 if not value or not value[0].isdigit(): 2398 raise errors.HeaderParseError("Expected section number but " 2399 "found {}".format(value)) 2400 digits = '' 2401 while value and value[0].isdigit(): 2402 digits += value[0] 2403 value = value[1:] 2404 if digits[0] == '0' and digits != '0': 2405 section.defects.append(errors.InvalidHeaderDefect( 2406 "section number has an invalid leading 0")) 2407 section.number = int(digits) 2408 section.append(ValueTerminal(digits, 'digits')) 2409 return section, value 2410 2411 2412def get_value(value): 2413 """ quoted-string / attribute 2414 2415 """ 2416 v = Value() 2417 if not value: 2418 raise errors.HeaderParseError("Expected value but found end of string") 2419 leader = None 2420 if value[0] in CFWS_LEADER: 2421 leader, value = get_cfws(value) 2422 if not value: 2423 raise errors.HeaderParseError("Expected value but found " 2424 "only {}".format(leader)) 2425 if value[0] == '"': 2426 token, value = get_quoted_string(value) 2427 else: 2428 token, value = get_extended_attribute(value) 2429 if leader is not None: 2430 token[:0] = [leader] 2431 v.append(token) 2432 return v, value 2433 2434def get_parameter(value): 2435 """ attribute [section] ["*"] [CFWS] "=" value 2436 2437 The CFWS is implied by the RFC but not made explicit in the BNF. This 2438 simplified form of the BNF from the RFC is made to conform with the RFC BNF 2439 through some extra checks. We do it this way because it makes both error 2440 recovery and working with the resulting parse tree easier. 2441 """ 2442 # It is possible CFWS would also be implicitly allowed between the section 2443 # and the 'extended-attribute' marker (the '*') , but we've never seen that 2444 # in the wild and we will therefore ignore the possibility. 
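    # Rough map of the shapes this routine accepts (editor's addition; the
    # parameter names and values are invented):
    #     title=foo                    attribute "=" value
    #     title*=us-ascii'en'f%20oo    extended parameter (charset'lang'value)
    #     title*0*=us-ascii'en'f%20    first section of an extended parameter
    #     title*1=oo                   continuation section with a plain value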
2445 param = Parameter() 2446 token, value = get_attribute(value) 2447 param.append(token) 2448 if not value or value[0] == ';': 2449 param.defects.append(errors.InvalidHeaderDefect("Parameter contains " 2450 "name ({}) but no value".format(token))) 2451 return param, value 2452 if value[0] == '*': 2453 try: 2454 token, value = get_section(value) 2455 param.sectioned = True 2456 param.append(token) 2457 except errors.HeaderParseError: 2458 pass 2459 if not value: 2460 raise errors.HeaderParseError("Incomplete parameter") 2461 if value[0] == '*': 2462 param.append(ValueTerminal('*', 'extended-parameter-marker')) 2463 value = value[1:] 2464 param.extended = True 2465 if value[0] != '=': 2466 raise errors.HeaderParseError("Parameter not followed by '='") 2467 param.append(ValueTerminal('=', 'parameter-separator')) 2468 value = value[1:] 2469 if value and value[0] in CFWS_LEADER: 2470 token, value = get_cfws(value) 2471 param.append(token) 2472 remainder = None 2473 appendto = param 2474 if param.extended and value and value[0] == '"': 2475 # Now for some serious hackery to handle the common invalid case of 2476 # double quotes around an extended value. We also accept (with defect) 2477 # a value marked as encoded that isn't really. 2478 qstring, remainder = get_quoted_string(value) 2479 inner_value = qstring.stripped_value 2480 semi_valid = False 2481 if param.section_number == 0: 2482 if inner_value and inner_value[0] == "'": 2483 semi_valid = True 2484 else: 2485 token, rest = get_attrtext(inner_value) 2486 if rest and rest[0] == "'": 2487 semi_valid = True 2488 else: 2489 try: 2490 token, rest = get_extended_attrtext(inner_value) 2491 except: 2492 pass 2493 else: 2494 if not rest: 2495 semi_valid = True 2496 if semi_valid: 2497 param.defects.append(errors.InvalidHeaderDefect( 2498 "Quoted string value for extended parameter is invalid")) 2499 param.append(qstring) 2500 for t in qstring: 2501 if t.token_type == 'bare-quoted-string': 2502 t[:] = [] 2503 appendto = t 2504 break 2505 value = inner_value 2506 else: 2507 remainder = None 2508 param.defects.append(errors.InvalidHeaderDefect( 2509 "Parameter marked as extended but appears to have a " 2510 "quoted string value that is non-encoded")) 2511 if value and value[0] == "'": 2512 token = None 2513 else: 2514 token, value = get_value(value) 2515 if not param.extended or param.section_number > 0: 2516 if not value or value[0] != "'": 2517 appendto.append(token) 2518 if remainder is not None: 2519 assert not value, value 2520 value = remainder 2521 return param, value 2522 param.defects.append(errors.InvalidHeaderDefect( 2523 "Apparent initial-extended-value but attribute " 2524 "was not marked as extended or was not initial section")) 2525 if not value: 2526 # Assume the charset/lang is missing and the token is the value. 
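        # (Editor's example, invented: something like title*=some%20text,
        # where the extended value carries no charset'lang' prefix at all.)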
        param.defects.append(errors.InvalidHeaderDefect(
            "Missing required charset/lang delimiters"))
        appendto.append(token)
        if remainder is None:
            return param, value
    else:
        if token is not None:
            for t in token:
                if t.token_type == 'extended-attrtext':
                    break
            t.token_type = 'attrtext'
            appendto.append(t)
            param.charset = t.value
        if value[0] != "'":
            raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                "delimiter, but found {!r}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
        if value and value[0] != "'":
            token, value = get_attrtext(value)
            appendto.append(token)
            param.lang = token.value
            if not value or value[0] != "'":
                raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                    "delimiter, but found {}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
    if remainder is not None:
        # Treat the rest of value as bare quoted string content.
        v = Value()
        while value:
            if value[0] in WSP:
                token, value = get_fws(value)
            elif value[0] == '"':
                token = ValueTerminal('"', 'DQUOTE')
                value = value[1:]
            else:
                token, value = get_qcontent(value)
            v.append(token)
        token = v
    else:
        token, value = get_value(value)
    appendto.append(token)
    if remainder is not None:
        assert not value, value
        value = remainder
    return param, value

def parse_mime_parameters(value):
    """ parameter *( ";" parameter )

    That BNF is meant to indicate this routine should only be called after
    finding and handling the leading ';'. There is no corresponding rule in
    the formal RFC grammar, but it is more convenient for us for the set of
    parameters to be treated as its own TokenList.

    This is a 'parse' routine because it consumes the remaining value, but it
    would never be called to parse a full header. Instead it is called to
    parse everything after the non-parameter value of a specific MIME header.

    """
    mime_parameters = MimeParameters()
    while value:
        try:
            token, value = get_parameter(value)
            mime_parameters.append(token)
        except errors.HeaderParseError:
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
            if not value:
                mime_parameters.append(leader)
                return mime_parameters
            if value[0] == ';':
                if leader is not None:
                    mime_parameters.append(leader)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "parameter entry with no content"))
            else:
                token, value = get_invalid_parameter(value)
                if leader:
                    token[:0] = [leader]
                mime_parameters.append(token)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "invalid parameter {!r}".format(token)))
        if value and value[0] != ';':
            # Junk after the otherwise valid parameter. Mark it as
            # invalid, but it will have a value.
            param = mime_parameters[-1]
            param.token_type = 'invalid-parameter'
            token, value = get_invalid_parameter(value)
            param.extend(token)
            mime_parameters.defects.append(errors.InvalidHeaderDefect(
                "parameter with invalid trailing text {!r}".format(token)))
        if value:
            # Must be a ';' at this point.
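            # (Editor's note: get_parameter and get_invalid_parameter both
            # stop at a ';' without consuming it, so any text left over here
            # can only start with the separator.)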
            mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
            value = value[1:]
    return mime_parameters

def _find_mime_parameters(tokenlist, value):
    """Do our best to find the parameters in an invalid MIME header

    """
    while value and value[0] != ';':
        if value[0] in PHRASE_ENDS:
            tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            tokenlist.append(token)
    if not value:
        return
    tokenlist.append(ValueTerminal(';', 'parameter-separator'))
    tokenlist.append(parse_mime_parameters(value[1:]))

def parse_content_type_header(value):
    """ maintype "/" subtype *( ";" parameter )

    The maintype and subtype are tokens. Theoretically they could
    be checked against the official IANA list + x-token, but we
    don't do that.
    """
    ctype = ContentType()
    if not value:
        ctype.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content type specification"))
        return ctype
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content maintype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    # XXX: If we really want to follow the formal grammar we should make
    # maintype and subtype specialized TokenLists here. Probably not worth it.
    if not value or value[0] != '/':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Invalid content type"))
        if value:
            _find_mime_parameters(ctype, value)
        return ctype
    ctype.maintype = token.value.strip().lower()
    ctype.append(ValueTerminal('/', 'content-type-separator'))
    value = value[1:]
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content subtype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    ctype.subtype = token.value.strip().lower()
    if not value:
        return ctype
    if value[0] != ';':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content type, but "
            "found {!r}".format(value)))
        # The RFC requires that a syntactically invalid content-type be treated
        # as text/plain. Perhaps we should postel this, but we should probably
        # only do that if we were checking the subtype value against IANA.
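        # (Editor's note, assumption not verified here: ContentType appears
        # to carry class-level text/plain defaults, so the del below falls
        # back to them instead of leaving maintype/subtype unset.)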
2692 del ctype.maintype, ctype.subtype 2693 _find_mime_parameters(ctype, value) 2694 return ctype 2695 ctype.append(ValueTerminal(';', 'parameter-separator')) 2696 ctype.append(parse_mime_parameters(value[1:])) 2697 return ctype 2698 2699def parse_content_disposition_header(value): 2700 """ disposition-type *( ";" parameter ) 2701 2702 """ 2703 disp_header = ContentDisposition() 2704 if not value: 2705 disp_header.defects.append(errors.HeaderMissingRequiredValue( 2706 "Missing content disposition")) 2707 return disp_header 2708 try: 2709 token, value = get_token(value) 2710 except errors.HeaderParseError: 2711 disp_header.defects.append(errors.InvalidHeaderDefect( 2712 "Expected content disposition but found {!r}".format(value))) 2713 _find_mime_parameters(disp_header, value) 2714 return disp_header 2715 disp_header.append(token) 2716 disp_header.content_disposition = token.value.strip().lower() 2717 if not value: 2718 return disp_header 2719 if value[0] != ';': 2720 disp_header.defects.append(errors.InvalidHeaderDefect( 2721 "Only parameters are valid after content disposition, but " 2722 "found {!r}".format(value))) 2723 _find_mime_parameters(disp_header, value) 2724 return disp_header 2725 disp_header.append(ValueTerminal(';', 'parameter-separator')) 2726 disp_header.append(parse_mime_parameters(value[1:])) 2727 return disp_header 2728 2729def parse_content_transfer_encoding_header(value): 2730 """ mechanism 2731 2732 """ 2733 # We should probably validate the values, since the list is fixed. 2734 cte_header = ContentTransferEncoding() 2735 if not value: 2736 cte_header.defects.append(errors.HeaderMissingRequiredValue( 2737 "Missing content transfer encoding")) 2738 return cte_header 2739 try: 2740 token, value = get_token(value) 2741 except errors.HeaderParseError: 2742 cte_header.defects.append(errors.InvalidHeaderDefect( 2743 "Expected content transfer encoding but found {!r}".format(value))) 2744 else: 2745 cte_header.append(token) 2746 cte_header.cte = token.value.strip().lower() 2747 if not value: 2748 return cte_header 2749 while value: 2750 cte_header.defects.append(errors.InvalidHeaderDefect( 2751 "Extra text after content transfer encoding")) 2752 if value[0] in PHRASE_ENDS: 2753 cte_header.append(ValueTerminal(value[0], 'misplaced-special')) 2754 value = value[1:] 2755 else: 2756 token, value = get_phrase(value) 2757 cte_header.append(token) 2758 return cte_header 2759 2760 2761# 2762# Header folding 2763# 2764# Header folding is complex, with lots of rules and corner cases. The 2765# following code does its best to obey the rules and handle the corner 2766# cases, but you can be sure there are few bugs:) 2767# 2768# This folder generally canonicalizes as it goes, preferring the stringified 2769# version of each token. The tokens contain information that supports the 2770# folder, including which tokens can be encoded in which ways. 2771# 2772# Folded text is accumulated in a simple list of strings ('lines'), each 2773# one of which should be less than policy.max_line_length ('maxlen'). 2774# 2775 2776def _steal_trailing_WSP_if_exists(lines): 2777 wsp = '' 2778 if lines and lines[-1] and lines[-1][-1] in WSP: 2779 wsp = lines[-1][-1] 2780 lines[-1] = lines[-1][:-1] 2781 return wsp 2782 2783def _refold_parse_tree(parse_tree, *, policy): 2784 """Return string of contents of parse_tree folded according to RFC rules. 2785 2786 """ 2787 # max_line_length 0/None means no limit, ie: infinitely long. 
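    # (A falsy max_line_length, i.e. 0 or None, therefore selects
    # sys.maxsize, which in practice disables length-based folding.)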
2788 maxlen = policy.max_line_length or sys.maxsize 2789 encoding = 'utf-8' if policy.utf8 else 'us-ascii' 2790 lines = [''] # Folded lines to be output 2791 leading_whitespace = '' # When we have whitespace between two encoded 2792 # words, we may need to encode the whitespace 2793 # at the beginning of the second word. 2794 last_ew = None # Points to the last encoded character if there's an ew on 2795 # the line 2796 last_charset = None 2797 wrap_as_ew_blocked = 0 2798 want_encoding = False # This is set to True if we need to encode this part 2799 end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') 2800 parts = list(parse_tree) 2801 while parts: 2802 part = parts.pop(0) 2803 if part is end_ew_not_allowed: 2804 wrap_as_ew_blocked -= 1 2805 continue 2806 tstr = str(part) 2807 if not want_encoding: 2808 if part.token_type == 'ptext': 2809 # Encode if tstr contains special characters. 2810 want_encoding = not SPECIALSNL.isdisjoint(tstr) 2811 else: 2812 # Encode if tstr contains newlines. 2813 want_encoding = not NLSET.isdisjoint(tstr) 2814 try: 2815 tstr.encode(encoding) 2816 charset = encoding 2817 except UnicodeEncodeError: 2818 if any(isinstance(x, errors.UndecodableBytesDefect) 2819 for x in part.all_defects): 2820 charset = 'unknown-8bit' 2821 else: 2822 # If policy.utf8 is false this should really be taken from a 2823 # 'charset' property on the policy. 2824 charset = 'utf-8' 2825 want_encoding = True 2826 2827 if part.token_type == 'mime-parameters': 2828 # Mime parameter folding (using RFC2231) is extra special. 2829 _fold_mime_parameters(part, lines, maxlen, encoding) 2830 continue 2831 2832 if want_encoding and not wrap_as_ew_blocked: 2833 if not part.as_ew_allowed: 2834 want_encoding = False 2835 last_ew = None 2836 if part.syntactic_break: 2837 encoded_part = part.fold(policy=policy)[:-len(policy.linesep)] 2838 if policy.linesep not in encoded_part: 2839 # It fits on a single line 2840 if len(encoded_part) > maxlen - len(lines[-1]): 2841 # But not on this one, so start a new one. 2842 newline = _steal_trailing_WSP_if_exists(lines) 2843 # XXX what if encoded_part has no leading FWS? 2844 lines.append(newline) 2845 lines[-1] += encoded_part 2846 continue 2847 # Either this is not a major syntactic break, so we don't 2848 # want it on a line by itself even if it fits, or it 2849 # doesn't fit on a line by itself. Either way, fall through 2850 # to unpacking the subparts and wrapping them. 2851 if not hasattr(part, 'encode'): 2852 # It's not a Terminal, do each piece individually. 2853 parts = list(part) + parts 2854 want_encoding = False 2855 continue 2856 elif part.as_ew_allowed: 2857 # It's a terminal, wrap it as an encoded word, possibly 2858 # combining it with previously encoded words if allowed. 2859 if (last_ew is not None and 2860 charset != last_charset and 2861 (last_charset == 'unknown-8bit' or 2862 last_charset == 'utf-8' and charset != 'us-ascii')): 2863 last_ew = None 2864 last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, 2865 part.ew_combine_allowed, charset, leading_whitespace) 2866 # This whitespace has been added to the lines in _fold_as_ew() 2867 # so clear it now. 2868 leading_whitespace = '' 2869 last_charset = charset 2870 want_encoding = False 2871 continue 2872 else: 2873 # It's a terminal which should be kept non-encoded 2874 # (e.g. a ListSeparator). 2875 last_ew = None 2876 want_encoding = False 2877 # fall through 2878 2879 if len(tstr) <= maxlen - len(lines[-1]): 2880 lines[-1] += tstr 2881 continue 2882 2883 # This part is too long to fit. 
The RFC wants us to break at 2884 # "major syntactic breaks", so unless we don't consider this 2885 # to be one, check if it will fit on the next line by itself. 2886 leading_whitespace = '' 2887 if (part.syntactic_break and 2888 len(tstr) + 1 <= maxlen): 2889 newline = _steal_trailing_WSP_if_exists(lines) 2890 if newline or part.startswith_fws(): 2891 # We're going to fold the data onto a new line here. Due to 2892 # the way encoded strings handle continuation lines, we need to 2893 # be prepared to encode any whitespace if the next line turns 2894 # out to start with an encoded word. 2895 lines.append(newline + tstr) 2896 2897 whitespace_accumulator = [] 2898 for char in lines[-1]: 2899 if char not in WSP: 2900 break 2901 whitespace_accumulator.append(char) 2902 leading_whitespace = ''.join(whitespace_accumulator) 2903 last_ew = None 2904 continue 2905 if not hasattr(part, 'encode'): 2906 # It's not a terminal, try folding the subparts. 2907 newparts = list(part) 2908 if not part.as_ew_allowed: 2909 wrap_as_ew_blocked += 1 2910 newparts.append(end_ew_not_allowed) 2911 parts = newparts + parts 2912 continue 2913 if part.as_ew_allowed and not wrap_as_ew_blocked: 2914 # It doesn't need CTE encoding, but encode it anyway so we can 2915 # wrap it. 2916 parts.insert(0, part) 2917 want_encoding = True 2918 continue 2919 # We can't figure out how to wrap, it, so give up. 2920 newline = _steal_trailing_WSP_if_exists(lines) 2921 if newline or part.startswith_fws(): 2922 lines.append(newline + tstr) 2923 else: 2924 # We can't fold it onto the next line either... 2925 lines[-1] += tstr 2926 2927 return policy.linesep.join(lines) + policy.linesep 2928 2929def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace): 2930 """Fold string to_encode into lines as encoded word, combining if allowed. 2931 Return the new value for last_ew, or None if ew_combine_allowed is False. 2932 2933 If there is already an encoded word in the last line of lines (indicated by 2934 a non-None value for last_ew) and ew_combine_allowed is true, decode the 2935 existing ew, combine it with to_encode, and re-encode. Otherwise, encode 2936 to_encode. In either case, split to_encode as necessary so that the 2937 encoded segments fit within maxlen. 2938 2939 """ 2940 if last_ew is not None and ew_combine_allowed: 2941 to_encode = str( 2942 get_unstructured(lines[-1][last_ew:] + to_encode)) 2943 lines[-1] = lines[-1][:last_ew] 2944 elif to_encode[0] in WSP: 2945 # We're joining this to non-encoded text, so don't encode 2946 # the leading blank. 2947 leading_wsp = to_encode[0] 2948 to_encode = to_encode[1:] 2949 if (len(lines[-1]) == maxlen): 2950 lines.append(_steal_trailing_WSP_if_exists(lines)) 2951 lines[-1] += leading_wsp 2952 2953 trailing_wsp = '' 2954 if to_encode[-1] in WSP: 2955 # Likewise for the trailing space. 2956 trailing_wsp = to_encode[-1] 2957 to_encode = to_encode[:-1] 2958 new_last_ew = len(lines[-1]) if last_ew is None else last_ew 2959 2960 encode_as = 'utf-8' if charset == 'us-ascii' else charset 2961 2962 # The RFC2047 chrome takes up 7 characters plus the length 2963 # of the charset name. 
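    # For example, '=?utf-8?q?text?=' spends 2 + 2 + 1 + 2 = 7 characters on
    # '=?', the two inner '?' delimiters, the encoding letter, and '?=',
    # plus len('utf-8') for the charset name itself.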
2964 chrome_len = len(encode_as) + 7 2965 2966 if (chrome_len + 1) >= maxlen: 2967 raise errors.HeaderParseError( 2968 "max_line_length is too small to fit an encoded word") 2969 2970 while to_encode: 2971 remaining_space = maxlen - len(lines[-1]) 2972 text_space = remaining_space - chrome_len - len(leading_whitespace) 2973 if text_space <= 0: 2974 lines.append(' ') 2975 continue 2976 2977 # If we are at the start of a continuation line, prepend whitespace 2978 # (we only want to do this when the line starts with an encoded word 2979 # but if we're folding in this helper function, then we know that we 2980 # are going to be writing out an encoded word.) 2981 if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace: 2982 encoded_word = _ew.encode(leading_whitespace, charset=encode_as) 2983 lines[-1] += encoded_word 2984 leading_whitespace = '' 2985 2986 to_encode_word = to_encode[:text_space] 2987 encoded_word = _ew.encode(to_encode_word, charset=encode_as) 2988 excess = len(encoded_word) - remaining_space 2989 while excess > 0: 2990 # Since the chunk to encode is guaranteed to fit into less than 100 characters, 2991 # shrinking it by one at a time shouldn't take long. 2992 to_encode_word = to_encode_word[:-1] 2993 encoded_word = _ew.encode(to_encode_word, charset=encode_as) 2994 excess = len(encoded_word) - remaining_space 2995 lines[-1] += encoded_word 2996 to_encode = to_encode[len(to_encode_word):] 2997 leading_whitespace = '' 2998 2999 if to_encode: 3000 lines.append(' ') 3001 new_last_ew = len(lines[-1]) 3002 lines[-1] += trailing_wsp 3003 return new_last_ew if ew_combine_allowed else None 3004 3005def _fold_mime_parameters(part, lines, maxlen, encoding): 3006 """Fold TokenList 'part' into the 'lines' list as mime parameters. 3007 3008 Using the decoded list of parameters and values, format them according to 3009 the RFC rules, including using RFC2231 encoding if the value cannot be 3010 expressed in 'encoding' and/or the parameter+value is too long to fit 3011 within 'maxlen'. 3012 3013 """ 3014 # Special case for RFC2231 encoding: start from decoded values and use 3015 # RFC2231 encoding iff needed. 3016 # 3017 # Note that the 1 and 2s being added to the length calculations are 3018 # accounting for the possibly-needed spaces and semicolons we'll be adding. 3019 # 3020 for name, value in part.params: 3021 # XXX What if this ';' puts us over maxlen the first time through the 3022 # loop? We should split the header value onto a newline in that case, 3023 # but to do that we need to recognize the need earlier or reparse the 3024 # header, so I'm going to ignore that bug for now. It'll only put us 3025 # one character over. 3026 if not lines[-1].rstrip().endswith(';'): 3027 lines[-1] += ';' 3028 charset = encoding 3029 error_handler = 'strict' 3030 try: 3031 value.encode(encoding) 3032 encoding_required = False 3033 except UnicodeEncodeError: 3034 encoding_required = True 3035 if utils._has_surrogates(value): 3036 charset = 'unknown-8bit' 3037 error_handler = 'surrogateescape' 3038 else: 3039 charset = 'utf-8' 3040 if encoding_required: 3041 encoded_value = urllib.parse.quote( 3042 value, safe='', errors=error_handler) 3043 tstr = "{}*={}''{}".format(name, charset, encoded_value) 3044 else: 3045 tstr = '{}={}'.format(name, quote_string(value)) 3046 if len(lines[-1]) + len(tstr) + 1 < maxlen: 3047 lines[-1] = lines[-1] + ' ' + tstr 3048 continue 3049 elif len(tstr) + 2 <= maxlen: 3050 lines.append(' ' + tstr) 3051 continue 3052 # We need multiple sections. 
We are allowed to mix encoded and 3053 # non-encoded sections, but we aren't going to. We'll encode them all. 3054 section = 0 3055 extra_chrome = charset + "''" 3056 while value: 3057 chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome) 3058 if maxlen <= chrome_len + 3: 3059 # We need room for the leading blank, the trailing semicolon, 3060 # and at least one character of the value. If we don't 3061 # have that, we'd be stuck, so in that case fall back to 3062 # the RFC standard width. 3063 maxlen = 78 3064 splitpoint = maxchars = maxlen - chrome_len - 2 3065 while True: 3066 partial = value[:splitpoint] 3067 encoded_value = urllib.parse.quote( 3068 partial, safe='', errors=error_handler) 3069 if len(encoded_value) <= maxchars: 3070 break 3071 splitpoint -= 1 3072 lines.append(" {}*{}*={}{}".format( 3073 name, section, extra_chrome, encoded_value)) 3074 extra_chrome = '' 3075 section += 1 3076 value = value[splitpoint:] 3077 if value: 3078 lines[-1] += ';' 3079
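# Editor's addition: a small, hypothetical smoke test of the parse_* entry
# points defined above. The helper name and sample inputs are invented and
# this is only a sketch of expected behaviour; nothing runs on import.

def _editor_demo():  # hypothetical helper; safe to delete
    ctype = parse_content_type_header('text/plain; charset="utf-8"')
    assert ctype.maintype == 'text' and ctype.subtype == 'plain'

    mime_version = parse_mime_version('1.0')
    assert (mime_version.major, mime_version.minor) == (1, 0)

    message_id = parse_message_id('<unique.id@example.com>')
    assert not message_id.defects

    # A bad element in an address list becomes a defect, not an exception,
    # and the valid mailbox before it is still returned.
    address_list, rest = get_address_list('Fred <fred@example.com>, bare text')
    assert rest == ''
    assert address_list.all_defects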