1"""Header value parser implementing various email-related RFC parsing rules. 2 3The parsing methods defined in this module implement various email related 4parsing rules. Principal among them is RFC 5322, which is the followon 5to RFC 2822 and primarily a clarification of the former. It also implements 6RFC 2047 encoded word decoding. 7 8RFC 5322 goes to considerable trouble to maintain backward compatibility with 9RFC 822 in the parse phase, while cleaning up the structure on the generation 10phase. This parser supports correct RFC 5322 generation by tagging white space 11as folding white space only when folding is allowed in the non-obsolete rule 12sets. Actually, the parser is even more generous when accepting input than RFC 135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages. 14Where possible deviations from the standard are annotated on the 'defects' 15attribute of tokens that deviate. 16 17The general structure of the parser follows RFC 5322, and uses its terminology 18where there is a direct correspondence. Where the implementation requires a 19somewhat different structure than that used by the formal grammar, new terms 20that mimic the closest existing terms are used. Thus, it really helps to have 21a copy of RFC 5322 handy when studying this code. 22 23Input to the parser is a string that has already been unfolded according to 24RFC 5322 rules. According to the RFC this unfolding is the very first step, and 25this parser leaves the unfolding step to a higher level message parser, which 26will have already detected the line breaks that need unfolding while 27determining the beginning and end of each header. 28 29The output of the parser is a TokenList object, which is a list subclass. A 30TokenList is a recursive data structure. The terminal nodes of the structure 31are Terminal objects, which are subclasses of str. These do not correspond 32directly to terminal objects in the formal grammar, but are instead more 33practical higher level combinations of true terminals. 34 35All TokenList and Terminal objects have a 'value' attribute, which produces the 36semantically meaningful value of that part of the parse subtree. The value of 37all whitespace tokens (no matter how many sub-tokens they may contain) is a 38single space, as per the RFC rules. This includes 'CFWS', which is herein 39included in the general class of whitespace tokens. There is one exception to 40the rule that whitespace tokens are collapsed into single spaces in values: in 41the value of a 'bare-quoted-string' (a quoted-string with no leading or 42trailing whitespace), any whitespace that appeared between the quotation marks 43is preserved in the returned value. Note that in all Terminal strings quoted 44pairs are turned into their unquoted values. 45 46All TokenList and Terminal objects also have a string value, which attempts to 47be a "canonical" representation of the RFC-compliant form of the substring that 48produced the parsed subtree, including minimal use of quoted pair quoting. 49Whitespace runs are not collapsed. 50 51Comment tokens also have a 'content' attribute providing the string found 52between the parens (including any nested comments) with whitespace preserved. 53 54All TokenList and Terminal objects have a 'defects' attribute which is a 55possibly empty list all of the defects found while creating the token. Defects 56may appear on any token in the tree, and a composite list of all defects in the 57subtree is available through the 'all_defects' attribute of any node. (For 58Terminal notes x.defects == x.all_defects.) 59 60Each object in a parse tree is called a 'token', and each has a 'token_type' 61attribute that gives the name from the RFC 5322 grammar that it represents. 62Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that 63may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters. 64It is returned in place of lists of (ctext/quoted-pair) and 65(qtext/quoted-pair). 66 67XXX: provide complete list of token types. 68""" 69 70import re 71import sys 72import urllib # For urllib.parse.unquote 73from string import hexdigits 74from operator import itemgetter 75from email import _encoded_words as _ew 76from email import errors 77from email import utils 78 79# 80# Useful constants and functions 81# 82 83WSP = set(' \t') 84CFWS_LEADER = WSP | set('(') 85SPECIALS = set(r'()<>@,:;.\"[]') 86ATOM_ENDS = SPECIALS | WSP 87DOT_ATOM_ENDS = ATOM_ENDS - set('.') 88# '.', '"', and '(' do not end phrases in order to support obs-phrase 89PHRASE_ENDS = SPECIALS - set('."(') 90TSPECIALS = (SPECIALS | set('/?=')) - set('.') 91TOKEN_ENDS = TSPECIALS | WSP 92ASPECIALS = TSPECIALS | set("*'%") 93ATTRIBUTE_ENDS = ASPECIALS | WSP 94EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') 95 96def quote_string(value): 97 return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' 98 99# Match a RFC 2047 word, looks like =?utf-8?q?someword?= 100rfc2047_matcher = re.compile(r''' 101 =\? # literal =? 102 [^?]* # charset 103 \? # literal ? 104 [qQbB] # literal 'q' or 'b', case insensitive 105 \? # literal ? 106 .*? # encoded word 107 \?= # literal ?= 108''', re.VERBOSE | re.MULTILINE) 109 110 111# 112# TokenList and its subclasses 113# 114 115class TokenList(list): 116 117 token_type = None 118 syntactic_break = True 119 ew_combine_allowed = True 120 121 def __init__(self, *args, **kw): 122 super().__init__(*args, **kw) 123 self.defects = [] 124 125 def __str__(self): 126 return ''.join(str(x) for x in self) 127 128 def __repr__(self): 129 return '{}({})'.format(self.__class__.__name__, 130 super().__repr__()) 131 132 @property 133 def value(self): 134 return ''.join(x.value for x in self if x.value) 135 136 @property 137 def all_defects(self): 138 return sum((x.all_defects for x in self), self.defects) 139 140 def startswith_fws(self): 141 return self[0].startswith_fws() 142 143 @property 144 def as_ew_allowed(self): 145 """True if all top level tokens of this part may be RFC2047 encoded.""" 146 return all(part.as_ew_allowed for part in self) 147 148 @property 149 def comments(self): 150 comments = [] 151 for token in self: 152 comments.extend(token.comments) 153 return comments 154 155 def fold(self, *, policy): 156 return _refold_parse_tree(self, policy=policy) 157 158 def pprint(self, indent=''): 159 print(self.ppstr(indent=indent)) 160 161 def ppstr(self, indent=''): 162 return '\n'.join(self._pp(indent=indent)) 163 164 def _pp(self, indent=''): 165 yield '{}{}/{}('.format( 166 indent, 167 self.__class__.__name__, 168 self.token_type) 169 for token in self: 170 if not hasattr(token, '_pp'): 171 yield (indent + ' !! invalid element in token ' 172 'list: {!r}'.format(token)) 173 else: 174 yield from token._pp(indent+' ') 175 if self.defects: 176 extra = ' Defects: {}'.format(self.defects) 177 else: 178 extra = '' 179 yield '{}){}'.format(indent, extra) 180 181 182class WhiteSpaceTokenList(TokenList): 183 184 @property 185 def value(self): 186 return ' ' 187 188 @property 189 def comments(self): 190 return [x.content for x in self if x.token_type=='comment'] 191 192 193class UnstructuredTokenList(TokenList): 194 token_type = 'unstructured' 195 196 197class Phrase(TokenList): 198 token_type = 'phrase' 199 200class Word(TokenList): 201 token_type = 'word' 202 203 204class CFWSList(WhiteSpaceTokenList): 205 token_type = 'cfws' 206 207 208class Atom(TokenList): 209 token_type = 'atom' 210 211 212class Token(TokenList): 213 token_type = 'token' 214 encode_as_ew = False 215 216 217class EncodedWord(TokenList): 218 token_type = 'encoded-word' 219 cte = None 220 charset = None 221 lang = None 222 223 224class QuotedString(TokenList): 225 226 token_type = 'quoted-string' 227 228 @property 229 def content(self): 230 for x in self: 231 if x.token_type == 'bare-quoted-string': 232 return x.value 233 234 @property 235 def quoted_value(self): 236 res = [] 237 for x in self: 238 if x.token_type == 'bare-quoted-string': 239 res.append(str(x)) 240 else: 241 res.append(x.value) 242 return ''.join(res) 243 244 @property 245 def stripped_value(self): 246 for token in self: 247 if token.token_type == 'bare-quoted-string': 248 return token.value 249 250 251class BareQuotedString(QuotedString): 252 253 token_type = 'bare-quoted-string' 254 255 def __str__(self): 256 return quote_string(''.join(str(x) for x in self)) 257 258 @property 259 def value(self): 260 return ''.join(str(x) for x in self) 261 262 263class Comment(WhiteSpaceTokenList): 264 265 token_type = 'comment' 266 267 def __str__(self): 268 return ''.join(sum([ 269 ["("], 270 [self.quote(x) for x in self], 271 [")"], 272 ], [])) 273 274 def quote(self, value): 275 if value.token_type == 'comment': 276 return str(value) 277 return str(value).replace('\\', '\\\\').replace( 278 '(', r'\(').replace( 279 ')', r'\)') 280 281 @property 282 def content(self): 283 return ''.join(str(x) for x in self) 284 285 @property 286 def comments(self): 287 return [self.content] 288 289class AddressList(TokenList): 290 291 token_type = 'address-list' 292 293 @property 294 def addresses(self): 295 return [x for x in self if x.token_type=='address'] 296 297 @property 298 def mailboxes(self): 299 return sum((x.mailboxes 300 for x in self if x.token_type=='address'), []) 301 302 @property 303 def all_mailboxes(self): 304 return sum((x.all_mailboxes 305 for x in self if x.token_type=='address'), []) 306 307 308class Address(TokenList): 309 310 token_type = 'address' 311 312 @property 313 def display_name(self): 314 if self[0].token_type == 'group': 315 return self[0].display_name 316 317 @property 318 def mailboxes(self): 319 if self[0].token_type == 'mailbox': 320 return [self[0]] 321 elif self[0].token_type == 'invalid-mailbox': 322 return [] 323 return self[0].mailboxes 324 325 @property 326 def all_mailboxes(self): 327 if self[0].token_type == 'mailbox': 328 return [self[0]] 329 elif self[0].token_type == 'invalid-mailbox': 330 return [self[0]] 331 return self[0].all_mailboxes 332 333class MailboxList(TokenList): 334 335 token_type = 'mailbox-list' 336 337 @property 338 def mailboxes(self): 339 return [x for x in self if x.token_type=='mailbox'] 340 341 @property 342 def all_mailboxes(self): 343 return [x for x in self 344 if x.token_type in ('mailbox', 'invalid-mailbox')] 345 346 347class GroupList(TokenList): 348 349 token_type = 'group-list' 350 351 @property 352 def mailboxes(self): 353 if not self or self[0].token_type != 'mailbox-list': 354 return [] 355 return self[0].mailboxes 356 357 @property 358 def all_mailboxes(self): 359 if not self or self[0].token_type != 'mailbox-list': 360 return [] 361 return self[0].all_mailboxes 362 363 364class Group(TokenList): 365 366 token_type = "group" 367 368 @property 369 def mailboxes(self): 370 if self[2].token_type != 'group-list': 371 return [] 372 return self[2].mailboxes 373 374 @property 375 def all_mailboxes(self): 376 if self[2].token_type != 'group-list': 377 return [] 378 return self[2].all_mailboxes 379 380 @property 381 def display_name(self): 382 return self[0].display_name 383 384 385class NameAddr(TokenList): 386 387 token_type = 'name-addr' 388 389 @property 390 def display_name(self): 391 if len(self) == 1: 392 return None 393 return self[0].display_name 394 395 @property 396 def local_part(self): 397 return self[-1].local_part 398 399 @property 400 def domain(self): 401 return self[-1].domain 402 403 @property 404 def route(self): 405 return self[-1].route 406 407 @property 408 def addr_spec(self): 409 return self[-1].addr_spec 410 411 412class AngleAddr(TokenList): 413 414 token_type = 'angle-addr' 415 416 @property 417 def local_part(self): 418 for x in self: 419 if x.token_type == 'addr-spec': 420 return x.local_part 421 422 @property 423 def domain(self): 424 for x in self: 425 if x.token_type == 'addr-spec': 426 return x.domain 427 428 @property 429 def route(self): 430 for x in self: 431 if x.token_type == 'obs-route': 432 return x.domains 433 434 @property 435 def addr_spec(self): 436 for x in self: 437 if x.token_type == 'addr-spec': 438 if x.local_part: 439 return x.addr_spec 440 else: 441 return quote_string(x.local_part) + x.addr_spec 442 else: 443 return '<>' 444 445 446class ObsRoute(TokenList): 447 448 token_type = 'obs-route' 449 450 @property 451 def domains(self): 452 return [x.domain for x in self if x.token_type == 'domain'] 453 454 455class Mailbox(TokenList): 456 457 token_type = 'mailbox' 458 459 @property 460 def display_name(self): 461 if self[0].token_type == 'name-addr': 462 return self[0].display_name 463 464 @property 465 def local_part(self): 466 return self[0].local_part 467 468 @property 469 def domain(self): 470 return self[0].domain 471 472 @property 473 def route(self): 474 if self[0].token_type == 'name-addr': 475 return self[0].route 476 477 @property 478 def addr_spec(self): 479 return self[0].addr_spec 480 481 482class InvalidMailbox(TokenList): 483 484 token_type = 'invalid-mailbox' 485 486 @property 487 def display_name(self): 488 return None 489 490 local_part = domain = route = addr_spec = display_name 491 492 493class Domain(TokenList): 494 495 token_type = 'domain' 496 as_ew_allowed = False 497 498 @property 499 def domain(self): 500 return ''.join(super().value.split()) 501 502 503class DotAtom(TokenList): 504 token_type = 'dot-atom' 505 506 507class DotAtomText(TokenList): 508 token_type = 'dot-atom-text' 509 as_ew_allowed = True 510 511 512class NoFoldLiteral(TokenList): 513 token_type = 'no-fold-literal' 514 as_ew_allowed = False 515 516 517class AddrSpec(TokenList): 518 519 token_type = 'addr-spec' 520 as_ew_allowed = False 521 522 @property 523 def local_part(self): 524 return self[0].local_part 525 526 @property 527 def domain(self): 528 if len(self) < 3: 529 return None 530 return self[-1].domain 531 532 @property 533 def value(self): 534 if len(self) < 3: 535 return self[0].value 536 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip() 537 538 @property 539 def addr_spec(self): 540 nameset = set(self.local_part) 541 if len(nameset) > len(nameset-DOT_ATOM_ENDS): 542 lp = quote_string(self.local_part) 543 else: 544 lp = self.local_part 545 if self.domain is not None: 546 return lp + '@' + self.domain 547 return lp 548 549 550class ObsLocalPart(TokenList): 551 552 token_type = 'obs-local-part' 553 as_ew_allowed = False 554 555 556class DisplayName(Phrase): 557 558 token_type = 'display-name' 559 ew_combine_allowed = False 560 561 @property 562 def display_name(self): 563 res = TokenList(self) 564 if len(res) == 0: 565 return res.value 566 if res[0].token_type == 'cfws': 567 res.pop(0) 568 else: 569 if res[0][0].token_type == 'cfws': 570 res[0] = TokenList(res[0][1:]) 571 if res[-1].token_type == 'cfws': 572 res.pop() 573 else: 574 if res[-1][-1].token_type == 'cfws': 575 res[-1] = TokenList(res[-1][:-1]) 576 return res.value 577 578 @property 579 def value(self): 580 quote = False 581 if self.defects: 582 quote = True 583 else: 584 for x in self: 585 if x.token_type == 'quoted-string': 586 quote = True 587 if len(self) != 0 and quote: 588 pre = post = '' 589 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws': 590 pre = ' ' 591 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws': 592 post = ' ' 593 return pre+quote_string(self.display_name)+post 594 else: 595 return super().value 596 597 598class LocalPart(TokenList): 599 600 token_type = 'local-part' 601 as_ew_allowed = False 602 603 @property 604 def value(self): 605 if self[0].token_type == "quoted-string": 606 return self[0].quoted_value 607 else: 608 return self[0].value 609 610 @property 611 def local_part(self): 612 # Strip whitespace from front, back, and around dots. 613 res = [DOT] 614 last = DOT 615 last_is_tl = False 616 for tok in self[0] + [DOT]: 617 if tok.token_type == 'cfws': 618 continue 619 if (last_is_tl and tok.token_type == 'dot' and 620 last[-1].token_type == 'cfws'): 621 res[-1] = TokenList(last[:-1]) 622 is_tl = isinstance(tok, TokenList) 623 if (is_tl and last.token_type == 'dot' and 624 tok[0].token_type == 'cfws'): 625 res.append(TokenList(tok[1:])) 626 else: 627 res.append(tok) 628 last = res[-1] 629 last_is_tl = is_tl 630 res = TokenList(res[1:-1]) 631 return res.value 632 633 634class DomainLiteral(TokenList): 635 636 token_type = 'domain-literal' 637 as_ew_allowed = False 638 639 @property 640 def domain(self): 641 return ''.join(super().value.split()) 642 643 @property 644 def ip(self): 645 for x in self: 646 if x.token_type == 'ptext': 647 return x.value 648 649 650class MIMEVersion(TokenList): 651 652 token_type = 'mime-version' 653 major = None 654 minor = None 655 656 657class Parameter(TokenList): 658 659 token_type = 'parameter' 660 sectioned = False 661 extended = False 662 charset = 'us-ascii' 663 664 @property 665 def section_number(self): 666 # Because the first token, the attribute (name) eats CFWS, the second 667 # token is always the section if there is one. 668 return self[1].number if self.sectioned else 0 669 670 @property 671 def param_value(self): 672 # This is part of the "handle quoted extended parameters" hack. 673 for token in self: 674 if token.token_type == 'value': 675 return token.stripped_value 676 if token.token_type == 'quoted-string': 677 for token in token: 678 if token.token_type == 'bare-quoted-string': 679 for token in token: 680 if token.token_type == 'value': 681 return token.stripped_value 682 return '' 683 684 685class InvalidParameter(Parameter): 686 687 token_type = 'invalid-parameter' 688 689 690class Attribute(TokenList): 691 692 token_type = 'attribute' 693 694 @property 695 def stripped_value(self): 696 for token in self: 697 if token.token_type.endswith('attrtext'): 698 return token.value 699 700class Section(TokenList): 701 702 token_type = 'section' 703 number = None 704 705 706class Value(TokenList): 707 708 token_type = 'value' 709 710 @property 711 def stripped_value(self): 712 token = self[0] 713 if token.token_type == 'cfws': 714 token = self[1] 715 if token.token_type.endswith( 716 ('quoted-string', 'attribute', 'extended-attribute')): 717 return token.stripped_value 718 return self.value 719 720 721class MimeParameters(TokenList): 722 723 token_type = 'mime-parameters' 724 syntactic_break = False 725 726 @property 727 def params(self): 728 # The RFC specifically states that the ordering of parameters is not 729 # guaranteed and may be reordered by the transport layer. So we have 730 # to assume the RFC 2231 pieces can come in any order. However, we 731 # output them in the order that we first see a given name, which gives 732 # us a stable __str__. 733 params = {} # Using order preserving dict from Python 3.7+ 734 for token in self: 735 if not token.token_type.endswith('parameter'): 736 continue 737 if token[0].token_type != 'attribute': 738 continue 739 name = token[0].value.strip() 740 if name not in params: 741 params[name] = [] 742 params[name].append((token.section_number, token)) 743 for name, parts in params.items(): 744 parts = sorted(parts, key=itemgetter(0)) 745 first_param = parts[0][1] 746 charset = first_param.charset 747 # Our arbitrary error recovery is to ignore duplicate parameters, 748 # to use appearance order if there are duplicate rfc 2231 parts, 749 # and to ignore gaps. This mimics the error recovery of get_param. 750 if not first_param.extended and len(parts) > 1: 751 if parts[1][0] == 0: 752 parts[1][1].defects.append(errors.InvalidHeaderDefect( 753 'duplicate parameter name; duplicate(s) ignored')) 754 parts = parts[:1] 755 # Else assume the *0* was missing...note that this is different 756 # from get_param, but we registered a defect for this earlier. 757 value_parts = [] 758 i = 0 759 for section_number, param in parts: 760 if section_number != i: 761 # We could get fancier here and look for a complete 762 # duplicate extended parameter and ignore the second one 763 # seen. But we're not doing that. The old code didn't. 764 if not param.extended: 765 param.defects.append(errors.InvalidHeaderDefect( 766 'duplicate parameter name; duplicate ignored')) 767 continue 768 else: 769 param.defects.append(errors.InvalidHeaderDefect( 770 "inconsistent RFC2231 parameter numbering")) 771 i += 1 772 value = param.param_value 773 if param.extended: 774 try: 775 value = urllib.parse.unquote_to_bytes(value) 776 except UnicodeEncodeError: 777 # source had surrogate escaped bytes. What we do now 778 # is a bit of an open question. I'm not sure this is 779 # the best choice, but it is what the old algorithm did 780 value = urllib.parse.unquote(value, encoding='latin-1') 781 else: 782 try: 783 value = value.decode(charset, 'surrogateescape') 784 except (LookupError, UnicodeEncodeError): 785 # XXX: there should really be a custom defect for 786 # unknown character set to make it easy to find, 787 # because otherwise unknown charset is a silent 788 # failure. 789 value = value.decode('us-ascii', 'surrogateescape') 790 if utils._has_surrogates(value): 791 param.defects.append(errors.UndecodableBytesDefect()) 792 value_parts.append(value) 793 value = ''.join(value_parts) 794 yield name, value 795 796 def __str__(self): 797 params = [] 798 for name, value in self.params: 799 if value: 800 params.append('{}={}'.format(name, quote_string(value))) 801 else: 802 params.append(name) 803 params = '; '.join(params) 804 return ' ' + params if params else '' 805 806 807class ParameterizedHeaderValue(TokenList): 808 809 # Set this false so that the value doesn't wind up on a new line even 810 # if it and the parameters would fit there but not on the first line. 811 syntactic_break = False 812 813 @property 814 def params(self): 815 for token in reversed(self): 816 if token.token_type == 'mime-parameters': 817 return token.params 818 return {} 819 820 821class ContentType(ParameterizedHeaderValue): 822 token_type = 'content-type' 823 as_ew_allowed = False 824 maintype = 'text' 825 subtype = 'plain' 826 827 828class ContentDisposition(ParameterizedHeaderValue): 829 token_type = 'content-disposition' 830 as_ew_allowed = False 831 content_disposition = None 832 833 834class ContentTransferEncoding(TokenList): 835 token_type = 'content-transfer-encoding' 836 as_ew_allowed = False 837 cte = '7bit' 838 839 840class HeaderLabel(TokenList): 841 token_type = 'header-label' 842 as_ew_allowed = False 843 844 845class MsgID(TokenList): 846 token_type = 'msg-id' 847 as_ew_allowed = False 848 849 def fold(self, policy): 850 # message-id tokens may not be folded. 851 return str(self) + policy.linesep 852 853 854class MessageID(MsgID): 855 token_type = 'message-id' 856 857 858class InvalidMessageID(MessageID): 859 token_type = 'invalid-message-id' 860 861 862class Header(TokenList): 863 token_type = 'header' 864 865 866# 867# Terminal classes and instances 868# 869 870class Terminal(str): 871 872 as_ew_allowed = True 873 ew_combine_allowed = True 874 syntactic_break = True 875 876 def __new__(cls, value, token_type): 877 self = super().__new__(cls, value) 878 self.token_type = token_type 879 self.defects = [] 880 return self 881 882 def __repr__(self): 883 return "{}({})".format(self.__class__.__name__, super().__repr__()) 884 885 def pprint(self): 886 print(self.__class__.__name__ + '/' + self.token_type) 887 888 @property 889 def all_defects(self): 890 return list(self.defects) 891 892 def _pp(self, indent=''): 893 return ["{}{}/{}({}){}".format( 894 indent, 895 self.__class__.__name__, 896 self.token_type, 897 super().__repr__(), 898 '' if not self.defects else ' {}'.format(self.defects), 899 )] 900 901 def pop_trailing_ws(self): 902 # This terminates the recursion. 903 return None 904 905 @property 906 def comments(self): 907 return [] 908 909 def __getnewargs__(self): 910 return(str(self), self.token_type) 911 912 913class WhiteSpaceTerminal(Terminal): 914 915 @property 916 def value(self): 917 return ' ' 918 919 def startswith_fws(self): 920 return True 921 922 923class ValueTerminal(Terminal): 924 925 @property 926 def value(self): 927 return self 928 929 def startswith_fws(self): 930 return False 931 932 933class EWWhiteSpaceTerminal(WhiteSpaceTerminal): 934 935 @property 936 def value(self): 937 return '' 938 939 def __str__(self): 940 return '' 941 942 943class _InvalidEwError(errors.HeaderParseError): 944 """Invalid encoded word found while parsing headers.""" 945 946 947# XXX these need to become classes and used as instances so 948# that a program can't change them in a parse tree and screw 949# up other parse trees. Maybe should have tests for that, too. 950DOT = ValueTerminal('.', 'dot') 951ListSeparator = ValueTerminal(',', 'list-separator') 952ListSeparator.as_ew_allowed = False 953RouteComponentMarker = ValueTerminal('@', 'route-component-marker') 954 955# 956# Parser 957# 958 959# Parse strings according to RFC822/2047/2822/5322 rules. 960# 961# This is a stateless parser. Each get_XXX function accepts a string and 962# returns either a Terminal or a TokenList representing the RFC object named 963# by the method and a string containing the remaining unparsed characters 964# from the input. Thus a parser method consumes the next syntactic construct 965# of a given type and returns a token representing the construct plus the 966# unparsed remainder of the input string. 967# 968# For example, if the first element of a structured header is a 'phrase', 969# then: 970# 971# phrase, value = get_phrase(value) 972# 973# returns the complete phrase from the start of the string value, plus any 974# characters left in the string after the phrase is removed. 975 976_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split 977_non_atom_end_matcher = re.compile(r"[^{}]+".format( 978 re.escape(''.join(ATOM_ENDS)))).match 979_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall 980_non_token_end_matcher = re.compile(r"[^{}]+".format( 981 re.escape(''.join(TOKEN_ENDS)))).match 982_non_attribute_end_matcher = re.compile(r"[^{}]+".format( 983 re.escape(''.join(ATTRIBUTE_ENDS)))).match 984_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format( 985 re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match 986 987def _validate_xtext(xtext): 988 """If input token contains ASCII non-printables, register a defect.""" 989 990 non_printables = _non_printable_finder(xtext) 991 if non_printables: 992 xtext.defects.append(errors.NonPrintableDefect(non_printables)) 993 if utils._has_surrogates(xtext): 994 xtext.defects.append(errors.UndecodableBytesDefect( 995 "Non-ASCII characters found in header token")) 996 997def _get_ptext_to_endchars(value, endchars): 998 """Scan printables/quoted-pairs until endchars and return unquoted ptext. 999 1000 This function turns a run of qcontent, ccontent-without-comments, or 1001 dtext-with-quoted-printables into a single string by unquoting any 1002 quoted printables. It returns the string, the remaining value, and 1003 a flag that is True iff there were any quoted printables decoded. 1004 1005 """ 1006 fragment, *remainder = _wsp_splitter(value, 1) 1007 vchars = [] 1008 escape = False 1009 had_qp = False 1010 for pos in range(len(fragment)): 1011 if fragment[pos] == '\\': 1012 if escape: 1013 escape = False 1014 had_qp = True 1015 else: 1016 escape = True 1017 continue 1018 if escape: 1019 escape = False 1020 elif fragment[pos] in endchars: 1021 break 1022 vchars.append(fragment[pos]) 1023 else: 1024 pos = pos + 1 1025 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp 1026 1027def get_fws(value): 1028 """FWS = 1*WSP 1029 1030 This isn't the RFC definition. We're using fws to represent tokens where 1031 folding can be done, but when we are parsing the *un*folding has already 1032 been done so we don't need to watch out for CRLF. 1033 1034 """ 1035 newvalue = value.lstrip() 1036 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') 1037 return fws, newvalue 1038 1039def get_encoded_word(value): 1040 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" 1041 1042 """ 1043 ew = EncodedWord() 1044 if not value.startswith('=?'): 1045 raise errors.HeaderParseError( 1046 "expected encoded word but found {}".format(value)) 1047 tok, *remainder = value[2:].split('?=', 1) 1048 if tok == value[2:]: 1049 raise errors.HeaderParseError( 1050 "expected encoded word but found {}".format(value)) 1051 remstr = ''.join(remainder) 1052 if (len(remstr) > 1 and 1053 remstr[0] in hexdigits and 1054 remstr[1] in hexdigits and 1055 tok.count('?') < 2): 1056 # The ? after the CTE was followed by an encoded word escape (=XX). 1057 rest, *remainder = remstr.split('?=', 1) 1058 tok = tok + '?=' + rest 1059 if len(tok.split()) > 1: 1060 ew.defects.append(errors.InvalidHeaderDefect( 1061 "whitespace inside encoded word")) 1062 ew.cte = value 1063 value = ''.join(remainder) 1064 try: 1065 text, charset, lang, defects = _ew.decode('=?' + tok + '?=') 1066 except (ValueError, KeyError): 1067 raise _InvalidEwError( 1068 "encoded word format invalid: '{}'".format(ew.cte)) 1069 ew.charset = charset 1070 ew.lang = lang 1071 ew.defects.extend(defects) 1072 while text: 1073 if text[0] in WSP: 1074 token, text = get_fws(text) 1075 ew.append(token) 1076 continue 1077 chars, *remainder = _wsp_splitter(text, 1) 1078 vtext = ValueTerminal(chars, 'vtext') 1079 _validate_xtext(vtext) 1080 ew.append(vtext) 1081 text = ''.join(remainder) 1082 # Encoded words should be followed by a WS 1083 if value and value[0] not in WSP: 1084 ew.defects.append(errors.InvalidHeaderDefect( 1085 "missing trailing whitespace after encoded-word")) 1086 return ew, value 1087 1088def get_unstructured(value): 1089 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct 1090 obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS) 1091 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR 1092 1093 obs-NO-WS-CTL is control characters except WSP/CR/LF. 1094 1095 So, basically, we have printable runs, plus control characters or nulls in 1096 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the 1097 obsolete syntax in its specification, but requires whitespace on either 1098 side of the encoded words, I can see no reason to need to separate the 1099 non-printable-non-whitespace from the printable runs if they occur, so we 1100 parse this into xtext tokens separated by WSP tokens. 1101 1102 Because an 'unstructured' value must by definition constitute the entire 1103 value, this 'get' routine does not return a remaining value, only the 1104 parsed TokenList. 1105 1106 """ 1107 # XXX: but what about bare CR and LF? They might signal the start or 1108 # end of an encoded word. YAGNI for now, since our current parsers 1109 # will never send us strings with bare CR or LF. 1110 1111 unstructured = UnstructuredTokenList() 1112 while value: 1113 if value[0] in WSP: 1114 token, value = get_fws(value) 1115 unstructured.append(token) 1116 continue 1117 valid_ew = True 1118 if value.startswith('=?'): 1119 try: 1120 token, value = get_encoded_word(value) 1121 except _InvalidEwError: 1122 valid_ew = False 1123 except errors.HeaderParseError: 1124 # XXX: Need to figure out how to register defects when 1125 # appropriate here. 1126 pass 1127 else: 1128 have_ws = True 1129 if len(unstructured) > 0: 1130 if unstructured[-1].token_type != 'fws': 1131 unstructured.defects.append(errors.InvalidHeaderDefect( 1132 "missing whitespace before encoded word")) 1133 have_ws = False 1134 if have_ws and len(unstructured) > 1: 1135 if unstructured[-2].token_type == 'encoded-word': 1136 unstructured[-1] = EWWhiteSpaceTerminal( 1137 unstructured[-1], 'fws') 1138 unstructured.append(token) 1139 continue 1140 tok, *remainder = _wsp_splitter(value, 1) 1141 # Split in the middle of an atom if there is a rfc2047 encoded word 1142 # which does not have WSP on both sides. The defect will be registered 1143 # the next time through the loop. 1144 # This needs to only be performed when the encoded word is valid; 1145 # otherwise, performing it on an invalid encoded word can cause 1146 # the parser to go in an infinite loop. 1147 if valid_ew and rfc2047_matcher.search(tok): 1148 tok, *remainder = value.partition('=?') 1149 vtext = ValueTerminal(tok, 'vtext') 1150 _validate_xtext(vtext) 1151 unstructured.append(vtext) 1152 value = ''.join(remainder) 1153 return unstructured 1154 1155def get_qp_ctext(value): 1156 r"""ctext = <printable ascii except \ ( )> 1157 1158 This is not the RFC ctext, since we are handling nested comments in comment 1159 and unquoting quoted-pairs here. We allow anything except the '()' 1160 characters, but if we find any ASCII other than the RFC defined printable 1161 ASCII, a NonPrintableDefect is added to the token's defects list. Since 1162 quoted pairs are converted to their unquoted values, what is returned is 1163 a 'ptext' token. In this case it is a WhiteSpaceTerminal, so it's value 1164 is ' '. 1165 1166 """ 1167 ptext, value, _ = _get_ptext_to_endchars(value, '()') 1168 ptext = WhiteSpaceTerminal(ptext, 'ptext') 1169 _validate_xtext(ptext) 1170 return ptext, value 1171 1172def get_qcontent(value): 1173 """qcontent = qtext / quoted-pair 1174 1175 We allow anything except the DQUOTE character, but if we find any ASCII 1176 other than the RFC defined printable ASCII, a NonPrintableDefect is 1177 added to the token's defects list. Any quoted pairs are converted to their 1178 unquoted values, so what is returned is a 'ptext' token. In this case it 1179 is a ValueTerminal. 1180 1181 """ 1182 ptext, value, _ = _get_ptext_to_endchars(value, '"') 1183 ptext = ValueTerminal(ptext, 'ptext') 1184 _validate_xtext(ptext) 1185 return ptext, value 1186 1187def get_atext(value): 1188 """atext = <matches _atext_matcher> 1189 1190 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to 1191 the token's defects list if we find non-atext characters. 1192 """ 1193 m = _non_atom_end_matcher(value) 1194 if not m: 1195 raise errors.HeaderParseError( 1196 "expected atext but found '{}'".format(value)) 1197 atext = m.group() 1198 value = value[len(atext):] 1199 atext = ValueTerminal(atext, 'atext') 1200 _validate_xtext(atext) 1201 return atext, value 1202 1203def get_bare_quoted_string(value): 1204 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE 1205 1206 A quoted-string without the leading or trailing white space. Its 1207 value is the text between the quote marks, with whitespace 1208 preserved and quoted pairs decoded. 1209 """ 1210 if value[0] != '"': 1211 raise errors.HeaderParseError( 1212 "expected '\"' but found '{}'".format(value)) 1213 bare_quoted_string = BareQuotedString() 1214 value = value[1:] 1215 if value and value[0] == '"': 1216 token, value = get_qcontent(value) 1217 bare_quoted_string.append(token) 1218 while value and value[0] != '"': 1219 if value[0] in WSP: 1220 token, value = get_fws(value) 1221 elif value[:2] == '=?': 1222 valid_ew = False 1223 try: 1224 token, value = get_encoded_word(value) 1225 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 1226 "encoded word inside quoted string")) 1227 valid_ew = True 1228 except errors.HeaderParseError: 1229 token, value = get_qcontent(value) 1230 # Collapse the whitespace between two encoded words that occur in a 1231 # bare-quoted-string. 1232 if valid_ew and len(bare_quoted_string) > 1: 1233 if (bare_quoted_string[-1].token_type == 'fws' and 1234 bare_quoted_string[-2].token_type == 'encoded-word'): 1235 bare_quoted_string[-1] = EWWhiteSpaceTerminal( 1236 bare_quoted_string[-1], 'fws') 1237 else: 1238 token, value = get_qcontent(value) 1239 bare_quoted_string.append(token) 1240 if not value: 1241 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 1242 "end of header inside quoted string")) 1243 return bare_quoted_string, value 1244 return bare_quoted_string, value[1:] 1245 1246def get_comment(value): 1247 """comment = "(" *([FWS] ccontent) [FWS] ")" 1248 ccontent = ctext / quoted-pair / comment 1249 1250 We handle nested comments here, and quoted-pair in our qp-ctext routine. 1251 """ 1252 if value and value[0] != '(': 1253 raise errors.HeaderParseError( 1254 "expected '(' but found '{}'".format(value)) 1255 comment = Comment() 1256 value = value[1:] 1257 while value and value[0] != ")": 1258 if value[0] in WSP: 1259 token, value = get_fws(value) 1260 elif value[0] == '(': 1261 token, value = get_comment(value) 1262 else: 1263 token, value = get_qp_ctext(value) 1264 comment.append(token) 1265 if not value: 1266 comment.defects.append(errors.InvalidHeaderDefect( 1267 "end of header inside comment")) 1268 return comment, value 1269 return comment, value[1:] 1270 1271def get_cfws(value): 1272 """CFWS = (1*([FWS] comment) [FWS]) / FWS 1273 1274 """ 1275 cfws = CFWSList() 1276 while value and value[0] in CFWS_LEADER: 1277 if value[0] in WSP: 1278 token, value = get_fws(value) 1279 else: 1280 token, value = get_comment(value) 1281 cfws.append(token) 1282 return cfws, value 1283 1284def get_quoted_string(value): 1285 """quoted-string = [CFWS] <bare-quoted-string> [CFWS] 1286 1287 'bare-quoted-string' is an intermediate class defined by this 1288 parser and not by the RFC grammar. It is the quoted string 1289 without any attached CFWS. 1290 """ 1291 quoted_string = QuotedString() 1292 if value and value[0] in CFWS_LEADER: 1293 token, value = get_cfws(value) 1294 quoted_string.append(token) 1295 token, value = get_bare_quoted_string(value) 1296 quoted_string.append(token) 1297 if value and value[0] in CFWS_LEADER: 1298 token, value = get_cfws(value) 1299 quoted_string.append(token) 1300 return quoted_string, value 1301 1302def get_atom(value): 1303 """atom = [CFWS] 1*atext [CFWS] 1304 1305 An atom could be an rfc2047 encoded word. 1306 """ 1307 atom = Atom() 1308 if value and value[0] in CFWS_LEADER: 1309 token, value = get_cfws(value) 1310 atom.append(token) 1311 if value and value[0] in ATOM_ENDS: 1312 raise errors.HeaderParseError( 1313 "expected atom but found '{}'".format(value)) 1314 if value.startswith('=?'): 1315 try: 1316 token, value = get_encoded_word(value) 1317 except errors.HeaderParseError: 1318 # XXX: need to figure out how to register defects when 1319 # appropriate here. 1320 token, value = get_atext(value) 1321 else: 1322 token, value = get_atext(value) 1323 atom.append(token) 1324 if value and value[0] in CFWS_LEADER: 1325 token, value = get_cfws(value) 1326 atom.append(token) 1327 return atom, value 1328 1329def get_dot_atom_text(value): 1330 """ dot-text = 1*atext *("." 1*atext) 1331 1332 """ 1333 dot_atom_text = DotAtomText() 1334 if not value or value[0] in ATOM_ENDS: 1335 raise errors.HeaderParseError("expected atom at a start of " 1336 "dot-atom-text but found '{}'".format(value)) 1337 while value and value[0] not in ATOM_ENDS: 1338 token, value = get_atext(value) 1339 dot_atom_text.append(token) 1340 if value and value[0] == '.': 1341 dot_atom_text.append(DOT) 1342 value = value[1:] 1343 if dot_atom_text[-1] is DOT: 1344 raise errors.HeaderParseError("expected atom at end of dot-atom-text " 1345 "but found '{}'".format('.'+value)) 1346 return dot_atom_text, value 1347 1348def get_dot_atom(value): 1349 """ dot-atom = [CFWS] dot-atom-text [CFWS] 1350 1351 Any place we can have a dot atom, we could instead have an rfc2047 encoded 1352 word. 1353 """ 1354 dot_atom = DotAtom() 1355 if value[0] in CFWS_LEADER: 1356 token, value = get_cfws(value) 1357 dot_atom.append(token) 1358 if value.startswith('=?'): 1359 try: 1360 token, value = get_encoded_word(value) 1361 except errors.HeaderParseError: 1362 # XXX: need to figure out how to register defects when 1363 # appropriate here. 1364 token, value = get_dot_atom_text(value) 1365 else: 1366 token, value = get_dot_atom_text(value) 1367 dot_atom.append(token) 1368 if value and value[0] in CFWS_LEADER: 1369 token, value = get_cfws(value) 1370 dot_atom.append(token) 1371 return dot_atom, value 1372 1373def get_word(value): 1374 """word = atom / quoted-string 1375 1376 Either atom or quoted-string may start with CFWS. We have to peel off this 1377 CFWS first to determine which type of word to parse. Afterward we splice 1378 the leading CFWS, if any, into the parsed sub-token. 1379 1380 If neither an atom or a quoted-string is found before the next special, a 1381 HeaderParseError is raised. 1382 1383 The token returned is either an Atom or a QuotedString, as appropriate. 1384 This means the 'word' level of the formal grammar is not represented in the 1385 parse tree; this is because having that extra layer when manipulating the 1386 parse tree is more confusing than it is helpful. 1387 1388 """ 1389 if value[0] in CFWS_LEADER: 1390 leader, value = get_cfws(value) 1391 else: 1392 leader = None 1393 if not value: 1394 raise errors.HeaderParseError( 1395 "Expected 'atom' or 'quoted-string' but found nothing.") 1396 if value[0]=='"': 1397 token, value = get_quoted_string(value) 1398 elif value[0] in SPECIALS: 1399 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' " 1400 "but found '{}'".format(value)) 1401 else: 1402 token, value = get_atom(value) 1403 if leader is not None: 1404 token[:0] = [leader] 1405 return token, value 1406 1407def get_phrase(value): 1408 """ phrase = 1*word / obs-phrase 1409 obs-phrase = word *(word / "." / CFWS) 1410 1411 This means a phrase can be a sequence of words, periods, and CFWS in any 1412 order as long as it starts with at least one word. If anything other than 1413 words is detected, an ObsoleteHeaderDefect is added to the token's defect 1414 list. We also accept a phrase that starts with CFWS followed by a dot; 1415 this is registered as an InvalidHeaderDefect, since it is not supported by 1416 even the obsolete grammar. 1417 1418 """ 1419 phrase = Phrase() 1420 try: 1421 token, value = get_word(value) 1422 phrase.append(token) 1423 except errors.HeaderParseError: 1424 phrase.defects.append(errors.InvalidHeaderDefect( 1425 "phrase does not start with word")) 1426 while value and value[0] not in PHRASE_ENDS: 1427 if value[0]=='.': 1428 phrase.append(DOT) 1429 phrase.defects.append(errors.ObsoleteHeaderDefect( 1430 "period in 'phrase'")) 1431 value = value[1:] 1432 else: 1433 try: 1434 token, value = get_word(value) 1435 except errors.HeaderParseError: 1436 if value[0] in CFWS_LEADER: 1437 token, value = get_cfws(value) 1438 phrase.defects.append(errors.ObsoleteHeaderDefect( 1439 "comment found without atom")) 1440 else: 1441 raise 1442 phrase.append(token) 1443 return phrase, value 1444 1445def get_local_part(value): 1446 """ local-part = dot-atom / quoted-string / obs-local-part 1447 1448 """ 1449 local_part = LocalPart() 1450 leader = None 1451 if value[0] in CFWS_LEADER: 1452 leader, value = get_cfws(value) 1453 if not value: 1454 raise errors.HeaderParseError( 1455 "expected local-part but found '{}'".format(value)) 1456 try: 1457 token, value = get_dot_atom(value) 1458 except errors.HeaderParseError: 1459 try: 1460 token, value = get_word(value) 1461 except errors.HeaderParseError: 1462 if value[0] != '\\' and value[0] in PHRASE_ENDS: 1463 raise 1464 token = TokenList() 1465 if leader is not None: 1466 token[:0] = [leader] 1467 local_part.append(token) 1468 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): 1469 obs_local_part, value = get_obs_local_part(str(local_part) + value) 1470 if obs_local_part.token_type == 'invalid-obs-local-part': 1471 local_part.defects.append(errors.InvalidHeaderDefect( 1472 "local-part is not dot-atom, quoted-string, or obs-local-part")) 1473 else: 1474 local_part.defects.append(errors.ObsoleteHeaderDefect( 1475 "local-part is not a dot-atom (contains CFWS)")) 1476 local_part[0] = obs_local_part 1477 try: 1478 local_part.value.encode('ascii') 1479 except UnicodeEncodeError: 1480 local_part.defects.append(errors.NonASCIILocalPartDefect( 1481 "local-part contains non-ASCII characters)")) 1482 return local_part, value 1483 1484def get_obs_local_part(value): 1485 """ obs-local-part = word *("." word) 1486 """ 1487 obs_local_part = ObsLocalPart() 1488 last_non_ws_was_dot = False 1489 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): 1490 if value[0] == '.': 1491 if last_non_ws_was_dot: 1492 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1493 "invalid repeated '.'")) 1494 obs_local_part.append(DOT) 1495 last_non_ws_was_dot = True 1496 value = value[1:] 1497 continue 1498 elif value[0]=='\\': 1499 obs_local_part.append(ValueTerminal(value[0], 1500 'misplaced-special')) 1501 value = value[1:] 1502 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1503 "'\\' character outside of quoted-string/ccontent")) 1504 last_non_ws_was_dot = False 1505 continue 1506 if obs_local_part and obs_local_part[-1].token_type != 'dot': 1507 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1508 "missing '.' between words")) 1509 try: 1510 token, value = get_word(value) 1511 last_non_ws_was_dot = False 1512 except errors.HeaderParseError: 1513 if value[0] not in CFWS_LEADER: 1514 raise 1515 token, value = get_cfws(value) 1516 obs_local_part.append(token) 1517 if (obs_local_part[0].token_type == 'dot' or 1518 obs_local_part[0].token_type=='cfws' and 1519 obs_local_part[1].token_type=='dot'): 1520 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1521 "Invalid leading '.' in local part")) 1522 if (obs_local_part[-1].token_type == 'dot' or 1523 obs_local_part[-1].token_type=='cfws' and 1524 obs_local_part[-2].token_type=='dot'): 1525 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1526 "Invalid trailing '.' in local part")) 1527 if obs_local_part.defects: 1528 obs_local_part.token_type = 'invalid-obs-local-part' 1529 return obs_local_part, value 1530 1531def get_dtext(value): 1532 r""" dtext = <printable ascii except \ [ ]> / obs-dtext 1533 obs-dtext = obs-NO-WS-CTL / quoted-pair 1534 1535 We allow anything except the excluded characters, but if we find any 1536 ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is 1537 added to the token's defects list. Quoted pairs are converted to their 1538 unquoted values, so what is returned is a ptext token, in this case a 1539 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is 1540 added to the returned token's defect list. 1541 1542 """ 1543 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]') 1544 ptext = ValueTerminal(ptext, 'ptext') 1545 if had_qp: 1546 ptext.defects.append(errors.ObsoleteHeaderDefect( 1547 "quoted printable found in domain-literal")) 1548 _validate_xtext(ptext) 1549 return ptext, value 1550 1551def _check_for_early_dl_end(value, domain_literal): 1552 if value: 1553 return False 1554 domain_literal.append(errors.InvalidHeaderDefect( 1555 "end of input inside domain-literal")) 1556 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 1557 return True 1558 1559def get_domain_literal(value): 1560 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS] 1561 1562 """ 1563 domain_literal = DomainLiteral() 1564 if value[0] in CFWS_LEADER: 1565 token, value = get_cfws(value) 1566 domain_literal.append(token) 1567 if not value: 1568 raise errors.HeaderParseError("expected domain-literal") 1569 if value[0] != '[': 1570 raise errors.HeaderParseError("expected '[' at start of domain-literal " 1571 "but found '{}'".format(value)) 1572 value = value[1:] 1573 if _check_for_early_dl_end(value, domain_literal): 1574 return domain_literal, value 1575 domain_literal.append(ValueTerminal('[', 'domain-literal-start')) 1576 if value[0] in WSP: 1577 token, value = get_fws(value) 1578 domain_literal.append(token) 1579 token, value = get_dtext(value) 1580 domain_literal.append(token) 1581 if _check_for_early_dl_end(value, domain_literal): 1582 return domain_literal, value 1583 if value[0] in WSP: 1584 token, value = get_fws(value) 1585 domain_literal.append(token) 1586 if _check_for_early_dl_end(value, domain_literal): 1587 return domain_literal, value 1588 if value[0] != ']': 1589 raise errors.HeaderParseError("expected ']' at end of domain-literal " 1590 "but found '{}'".format(value)) 1591 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 1592 value = value[1:] 1593 if value and value[0] in CFWS_LEADER: 1594 token, value = get_cfws(value) 1595 domain_literal.append(token) 1596 return domain_literal, value 1597 1598def get_domain(value): 1599 """ domain = dot-atom / domain-literal / obs-domain 1600 obs-domain = atom *("." atom)) 1601 1602 """ 1603 domain = Domain() 1604 leader = None 1605 if value[0] in CFWS_LEADER: 1606 leader, value = get_cfws(value) 1607 if not value: 1608 raise errors.HeaderParseError( 1609 "expected domain but found '{}'".format(value)) 1610 if value[0] == '[': 1611 token, value = get_domain_literal(value) 1612 if leader is not None: 1613 token[:0] = [leader] 1614 domain.append(token) 1615 return domain, value 1616 try: 1617 token, value = get_dot_atom(value) 1618 except errors.HeaderParseError: 1619 token, value = get_atom(value) 1620 if value and value[0] == '@': 1621 raise errors.HeaderParseError('Invalid Domain') 1622 if leader is not None: 1623 token[:0] = [leader] 1624 domain.append(token) 1625 if value and value[0] == '.': 1626 domain.defects.append(errors.ObsoleteHeaderDefect( 1627 "domain is not a dot-atom (contains CFWS)")) 1628 if domain[0].token_type == 'dot-atom': 1629 domain[:] = domain[0] 1630 while value and value[0] == '.': 1631 domain.append(DOT) 1632 token, value = get_atom(value[1:]) 1633 domain.append(token) 1634 return domain, value 1635 1636def get_addr_spec(value): 1637 """ addr-spec = local-part "@" domain 1638 1639 """ 1640 addr_spec = AddrSpec() 1641 token, value = get_local_part(value) 1642 addr_spec.append(token) 1643 if not value or value[0] != '@': 1644 addr_spec.defects.append(errors.InvalidHeaderDefect( 1645 "addr-spec local part with no domain")) 1646 return addr_spec, value 1647 addr_spec.append(ValueTerminal('@', 'address-at-symbol')) 1648 token, value = get_domain(value[1:]) 1649 addr_spec.append(token) 1650 return addr_spec, value 1651 1652def get_obs_route(value): 1653 """ obs-route = obs-domain-list ":" 1654 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain]) 1655 1656 Returns an obs-route token with the appropriate sub-tokens (that is, 1657 there is no obs-domain-list in the parse tree). 1658 """ 1659 obs_route = ObsRoute() 1660 while value and (value[0]==',' or value[0] in CFWS_LEADER): 1661 if value[0] in CFWS_LEADER: 1662 token, value = get_cfws(value) 1663 obs_route.append(token) 1664 elif value[0] == ',': 1665 obs_route.append(ListSeparator) 1666 value = value[1:] 1667 if not value or value[0] != '@': 1668 raise errors.HeaderParseError( 1669 "expected obs-route domain but found '{}'".format(value)) 1670 obs_route.append(RouteComponentMarker) 1671 token, value = get_domain(value[1:]) 1672 obs_route.append(token) 1673 while value and value[0]==',': 1674 obs_route.append(ListSeparator) 1675 value = value[1:] 1676 if not value: 1677 break 1678 if value[0] in CFWS_LEADER: 1679 token, value = get_cfws(value) 1680 obs_route.append(token) 1681 if value[0] == '@': 1682 obs_route.append(RouteComponentMarker) 1683 token, value = get_domain(value[1:]) 1684 obs_route.append(token) 1685 if not value: 1686 raise errors.HeaderParseError("end of header while parsing obs-route") 1687 if value[0] != ':': 1688 raise errors.HeaderParseError( "expected ':' marking end of " 1689 "obs-route but found '{}'".format(value)) 1690 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker')) 1691 return obs_route, value[1:] 1692 1693def get_angle_addr(value): 1694 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr 1695 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] 1696 1697 """ 1698 angle_addr = AngleAddr() 1699 if value[0] in CFWS_LEADER: 1700 token, value = get_cfws(value) 1701 angle_addr.append(token) 1702 if not value or value[0] != '<': 1703 raise errors.HeaderParseError( 1704 "expected angle-addr but found '{}'".format(value)) 1705 angle_addr.append(ValueTerminal('<', 'angle-addr-start')) 1706 value = value[1:] 1707 # Although it is not legal per RFC5322, SMTP uses '<>' in certain 1708 # circumstances. 1709 if value[0] == '>': 1710 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 1711 angle_addr.defects.append(errors.InvalidHeaderDefect( 1712 "null addr-spec in angle-addr")) 1713 value = value[1:] 1714 return angle_addr, value 1715 try: 1716 token, value = get_addr_spec(value) 1717 except errors.HeaderParseError: 1718 try: 1719 token, value = get_obs_route(value) 1720 angle_addr.defects.append(errors.ObsoleteHeaderDefect( 1721 "obsolete route specification in angle-addr")) 1722 except errors.HeaderParseError: 1723 raise errors.HeaderParseError( 1724 "expected addr-spec or obs-route but found '{}'".format(value)) 1725 angle_addr.append(token) 1726 token, value = get_addr_spec(value) 1727 angle_addr.append(token) 1728 if value and value[0] == '>': 1729 value = value[1:] 1730 else: 1731 angle_addr.defects.append(errors.InvalidHeaderDefect( 1732 "missing trailing '>' on angle-addr")) 1733 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 1734 if value and value[0] in CFWS_LEADER: 1735 token, value = get_cfws(value) 1736 angle_addr.append(token) 1737 return angle_addr, value 1738 1739def get_display_name(value): 1740 """ display-name = phrase 1741 1742 Because this is simply a name-rule, we don't return a display-name 1743 token containing a phrase, but rather a display-name token with 1744 the content of the phrase. 1745 1746 """ 1747 display_name = DisplayName() 1748 token, value = get_phrase(value) 1749 display_name.extend(token[:]) 1750 display_name.defects = token.defects[:] 1751 return display_name, value 1752 1753 1754def get_name_addr(value): 1755 """ name-addr = [display-name] angle-addr 1756 1757 """ 1758 name_addr = NameAddr() 1759 # Both the optional display name and the angle-addr can start with cfws. 1760 leader = None 1761 if value[0] in CFWS_LEADER: 1762 leader, value = get_cfws(value) 1763 if not value: 1764 raise errors.HeaderParseError( 1765 "expected name-addr but found '{}'".format(leader)) 1766 if value[0] != '<': 1767 if value[0] in PHRASE_ENDS: 1768 raise errors.HeaderParseError( 1769 "expected name-addr but found '{}'".format(value)) 1770 token, value = get_display_name(value) 1771 if not value: 1772 raise errors.HeaderParseError( 1773 "expected name-addr but found '{}'".format(token)) 1774 if leader is not None: 1775 token[0][:0] = [leader] 1776 leader = None 1777 name_addr.append(token) 1778 token, value = get_angle_addr(value) 1779 if leader is not None: 1780 token[:0] = [leader] 1781 name_addr.append(token) 1782 return name_addr, value 1783 1784def get_mailbox(value): 1785 """ mailbox = name-addr / addr-spec 1786 1787 """ 1788 # The only way to figure out if we are dealing with a name-addr or an 1789 # addr-spec is to try parsing each one. 1790 mailbox = Mailbox() 1791 try: 1792 token, value = get_name_addr(value) 1793 except errors.HeaderParseError: 1794 try: 1795 token, value = get_addr_spec(value) 1796 except errors.HeaderParseError: 1797 raise errors.HeaderParseError( 1798 "expected mailbox but found '{}'".format(value)) 1799 if any(isinstance(x, errors.InvalidHeaderDefect) 1800 for x in token.all_defects): 1801 mailbox.token_type = 'invalid-mailbox' 1802 mailbox.append(token) 1803 return mailbox, value 1804 1805def get_invalid_mailbox(value, endchars): 1806 """ Read everything up to one of the chars in endchars. 1807 1808 This is outside the formal grammar. The InvalidMailbox TokenList that is 1809 returned acts like a Mailbox, but the data attributes are None. 1810 1811 """ 1812 invalid_mailbox = InvalidMailbox() 1813 while value and value[0] not in endchars: 1814 if value[0] in PHRASE_ENDS: 1815 invalid_mailbox.append(ValueTerminal(value[0], 1816 'misplaced-special')) 1817 value = value[1:] 1818 else: 1819 token, value = get_phrase(value) 1820 invalid_mailbox.append(token) 1821 return invalid_mailbox, value 1822 1823def get_mailbox_list(value): 1824 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list 1825 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]) 1826 1827 For this routine we go outside the formal grammar in order to improve error 1828 handling. We recognize the end of the mailbox list only at the end of the 1829 value or at a ';' (the group terminator). This is so that we can turn 1830 invalid mailboxes into InvalidMailbox tokens and continue parsing any 1831 remaining valid mailboxes. We also allow all mailbox entries to be null, 1832 and this condition is handled appropriately at a higher level. 1833 1834 """ 1835 mailbox_list = MailboxList() 1836 while value and value[0] != ';': 1837 try: 1838 token, value = get_mailbox(value) 1839 mailbox_list.append(token) 1840 except errors.HeaderParseError: 1841 leader = None 1842 if value[0] in CFWS_LEADER: 1843 leader, value = get_cfws(value) 1844 if not value or value[0] in ',;': 1845 mailbox_list.append(leader) 1846 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 1847 "empty element in mailbox-list")) 1848 else: 1849 token, value = get_invalid_mailbox(value, ',;') 1850 if leader is not None: 1851 token[:0] = [leader] 1852 mailbox_list.append(token) 1853 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1854 "invalid mailbox in mailbox-list")) 1855 elif value[0] == ',': 1856 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 1857 "empty element in mailbox-list")) 1858 else: 1859 token, value = get_invalid_mailbox(value, ',;') 1860 if leader is not None: 1861 token[:0] = [leader] 1862 mailbox_list.append(token) 1863 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1864 "invalid mailbox in mailbox-list")) 1865 if value and value[0] not in ',;': 1866 # Crap after mailbox; treat it as an invalid mailbox. 1867 # The mailbox info will still be available. 1868 mailbox = mailbox_list[-1] 1869 mailbox.token_type = 'invalid-mailbox' 1870 token, value = get_invalid_mailbox(value, ',;') 1871 mailbox.extend(token) 1872 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1873 "invalid mailbox in mailbox-list")) 1874 if value and value[0] == ',': 1875 mailbox_list.append(ListSeparator) 1876 value = value[1:] 1877 return mailbox_list, value 1878 1879 1880def get_group_list(value): 1881 """ group-list = mailbox-list / CFWS / obs-group-list 1882 obs-group-list = 1*([CFWS] ",") [CFWS] 1883 1884 """ 1885 group_list = GroupList() 1886 if not value: 1887 group_list.defects.append(errors.InvalidHeaderDefect( 1888 "end of header before group-list")) 1889 return group_list, value 1890 leader = None 1891 if value and value[0] in CFWS_LEADER: 1892 leader, value = get_cfws(value) 1893 if not value: 1894 # This should never happen in email parsing, since CFWS-only is a 1895 # legal alternative to group-list in a group, which is the only 1896 # place group-list appears. 1897 group_list.defects.append(errors.InvalidHeaderDefect( 1898 "end of header in group-list")) 1899 group_list.append(leader) 1900 return group_list, value 1901 if value[0] == ';': 1902 group_list.append(leader) 1903 return group_list, value 1904 token, value = get_mailbox_list(value) 1905 if len(token.all_mailboxes)==0: 1906 if leader is not None: 1907 group_list.append(leader) 1908 group_list.extend(token) 1909 group_list.defects.append(errors.ObsoleteHeaderDefect( 1910 "group-list with empty entries")) 1911 return group_list, value 1912 if leader is not None: 1913 token[:0] = [leader] 1914 group_list.append(token) 1915 return group_list, value 1916 1917def get_group(value): 1918 """ group = display-name ":" [group-list] ";" [CFWS] 1919 1920 """ 1921 group = Group() 1922 token, value = get_display_name(value) 1923 if not value or value[0] != ':': 1924 raise errors.HeaderParseError("expected ':' at end of group " 1925 "display name but found '{}'".format(value)) 1926 group.append(token) 1927 group.append(ValueTerminal(':', 'group-display-name-terminator')) 1928 value = value[1:] 1929 if value and value[0] == ';': 1930 group.append(ValueTerminal(';', 'group-terminator')) 1931 return group, value[1:] 1932 token, value = get_group_list(value) 1933 group.append(token) 1934 if not value: 1935 group.defects.append(errors.InvalidHeaderDefect( 1936 "end of header in group")) 1937 elif value[0] != ';': 1938 raise errors.HeaderParseError( 1939 "expected ';' at end of group but found {}".format(value)) 1940 group.append(ValueTerminal(';', 'group-terminator')) 1941 value = value[1:] 1942 if value and value[0] in CFWS_LEADER: 1943 token, value = get_cfws(value) 1944 group.append(token) 1945 return group, value 1946 1947def get_address(value): 1948 """ address = mailbox / group 1949 1950 Note that counter-intuitively, an address can be either a single address or 1951 a list of addresses (a group). This is why the returned Address object has 1952 a 'mailboxes' attribute which treats a single address as a list of length 1953 one. When you need to differentiate between to two cases, extract the single 1954 element, which is either a mailbox or a group token. 1955 1956 """ 1957 # The formal grammar isn't very helpful when parsing an address. mailbox 1958 # and group, especially when allowing for obsolete forms, start off very 1959 # similarly. It is only when you reach one of @, <, or : that you know 1960 # what you've got. So, we try each one in turn, starting with the more 1961 # likely of the two. We could perhaps make this more efficient by looking 1962 # for a phrase and then branching based on the next character, but that 1963 # would be a premature optimization. 1964 address = Address() 1965 try: 1966 token, value = get_group(value) 1967 except errors.HeaderParseError: 1968 try: 1969 token, value = get_mailbox(value) 1970 except errors.HeaderParseError: 1971 raise errors.HeaderParseError( 1972 "expected address but found '{}'".format(value)) 1973 address.append(token) 1974 return address, value 1975 1976def get_address_list(value): 1977 """ address_list = (address *("," address)) / obs-addr-list 1978 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]) 1979 1980 We depart from the formal grammar here by continuing to parse until the end 1981 of the input, assuming the input to be entirely composed of an 1982 address-list. This is always true in email parsing, and allows us 1983 to skip invalid addresses to parse additional valid ones. 1984 1985 """ 1986 address_list = AddressList() 1987 while value: 1988 try: 1989 token, value = get_address(value) 1990 address_list.append(token) 1991 except errors.HeaderParseError as err: 1992 leader = None 1993 if value[0] in CFWS_LEADER: 1994 leader, value = get_cfws(value) 1995 if not value or value[0] == ',': 1996 address_list.append(leader) 1997 address_list.defects.append(errors.ObsoleteHeaderDefect( 1998 "address-list entry with no content")) 1999 else: 2000 token, value = get_invalid_mailbox(value, ',') 2001 if leader is not None: 2002 token[:0] = [leader] 2003 address_list.append(Address([token])) 2004 address_list.defects.append(errors.InvalidHeaderDefect( 2005 "invalid address in address-list")) 2006 elif value[0] == ',': 2007 address_list.defects.append(errors.ObsoleteHeaderDefect( 2008 "empty element in address-list")) 2009 else: 2010 token, value = get_invalid_mailbox(value, ',') 2011 if leader is not None: 2012 token[:0] = [leader] 2013 address_list.append(Address([token])) 2014 address_list.defects.append(errors.InvalidHeaderDefect( 2015 "invalid address in address-list")) 2016 if value and value[0] != ',': 2017 # Crap after address; treat it as an invalid mailbox. 2018 # The mailbox info will still be available. 2019 mailbox = address_list[-1][0] 2020 mailbox.token_type = 'invalid-mailbox' 2021 token, value = get_invalid_mailbox(value, ',') 2022 mailbox.extend(token) 2023 address_list.defects.append(errors.InvalidHeaderDefect( 2024 "invalid address in address-list")) 2025 if value: # Must be a , at this point. 2026 address_list.append(ListSeparator) 2027 value = value[1:] 2028 return address_list, value 2029 2030 2031def get_no_fold_literal(value): 2032 """ no-fold-literal = "[" *dtext "]" 2033 """ 2034 no_fold_literal = NoFoldLiteral() 2035 if not value: 2036 raise errors.HeaderParseError( 2037 "expected no-fold-literal but found '{}'".format(value)) 2038 if value[0] != '[': 2039 raise errors.HeaderParseError( 2040 "expected '[' at the start of no-fold-literal " 2041 "but found '{}'".format(value)) 2042 no_fold_literal.append(ValueTerminal('[', 'no-fold-literal-start')) 2043 value = value[1:] 2044 token, value = get_dtext(value) 2045 no_fold_literal.append(token) 2046 if not value or value[0] != ']': 2047 raise errors.HeaderParseError( 2048 "expected ']' at the end of no-fold-literal " 2049 "but found '{}'".format(value)) 2050 no_fold_literal.append(ValueTerminal(']', 'no-fold-literal-end')) 2051 return no_fold_literal, value[1:] 2052 2053def get_msg_id(value): 2054 """msg-id = [CFWS] "<" id-left '@' id-right ">" [CFWS] 2055 id-left = dot-atom-text / obs-id-left 2056 id-right = dot-atom-text / no-fold-literal / obs-id-right 2057 no-fold-literal = "[" *dtext "]" 2058 """ 2059 msg_id = MsgID() 2060 if value and value[0] in CFWS_LEADER: 2061 token, value = get_cfws(value) 2062 msg_id.append(token) 2063 if not value or value[0] != '<': 2064 raise errors.HeaderParseError( 2065 "expected msg-id but found '{}'".format(value)) 2066 msg_id.append(ValueTerminal('<', 'msg-id-start')) 2067 value = value[1:] 2068 # Parse id-left. 2069 try: 2070 token, value = get_dot_atom_text(value) 2071 except errors.HeaderParseError: 2072 try: 2073 # obs-id-left is same as local-part of add-spec. 2074 token, value = get_obs_local_part(value) 2075 msg_id.defects.append(errors.ObsoleteHeaderDefect( 2076 "obsolete id-left in msg-id")) 2077 except errors.HeaderParseError: 2078 raise errors.HeaderParseError( 2079 "expected dot-atom-text or obs-id-left" 2080 " but found '{}'".format(value)) 2081 msg_id.append(token) 2082 if not value or value[0] != '@': 2083 msg_id.defects.append(errors.InvalidHeaderDefect( 2084 "msg-id with no id-right")) 2085 # Even though there is no id-right, if the local part 2086 # ends with `>` let's just parse it too and return 2087 # along with the defect. 2088 if value and value[0] == '>': 2089 msg_id.append(ValueTerminal('>', 'msg-id-end')) 2090 value = value[1:] 2091 return msg_id, value 2092 msg_id.append(ValueTerminal('@', 'address-at-symbol')) 2093 value = value[1:] 2094 # Parse id-right. 2095 try: 2096 token, value = get_dot_atom_text(value) 2097 except errors.HeaderParseError: 2098 try: 2099 token, value = get_no_fold_literal(value) 2100 except errors.HeaderParseError as e: 2101 try: 2102 token, value = get_domain(value) 2103 msg_id.defects.append(errors.ObsoleteHeaderDefect( 2104 "obsolete id-right in msg-id")) 2105 except errors.HeaderParseError: 2106 raise errors.HeaderParseError( 2107 "expected dot-atom-text, no-fold-literal or obs-id-right" 2108 " but found '{}'".format(value)) 2109 msg_id.append(token) 2110 if value and value[0] == '>': 2111 value = value[1:] 2112 else: 2113 msg_id.defects.append(errors.InvalidHeaderDefect( 2114 "missing trailing '>' on msg-id")) 2115 msg_id.append(ValueTerminal('>', 'msg-id-end')) 2116 if value and value[0] in CFWS_LEADER: 2117 token, value = get_cfws(value) 2118 msg_id.append(token) 2119 return msg_id, value 2120 2121 2122def parse_message_id(value): 2123 """message-id = "Message-ID:" msg-id CRLF 2124 """ 2125 message_id = MessageID() 2126 try: 2127 token, value = get_msg_id(value) 2128 message_id.append(token) 2129 except errors.HeaderParseError as ex: 2130 token = get_unstructured(value) 2131 message_id = InvalidMessageID(token) 2132 message_id.defects.append( 2133 errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(ex))) 2134 else: 2135 # Value after parsing a valid msg_id should be None. 2136 if value: 2137 message_id.defects.append(errors.InvalidHeaderDefect( 2138 "Unexpected {!r}".format(value))) 2139 2140 return message_id 2141 2142# 2143# XXX: As I begin to add additional header parsers, I'm realizing we probably 2144# have two level of parser routines: the get_XXX methods that get a token in 2145# the grammar, and parse_XXX methods that parse an entire field value. So 2146# get_address_list above should really be a parse_ method, as probably should 2147# be get_unstructured. 2148# 2149 2150def parse_mime_version(value): 2151 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS] 2152 2153 """ 2154 # The [CFWS] is implicit in the RFC 2045 BNF. 2155 # XXX: This routine is a bit verbose, should factor out a get_int method. 2156 mime_version = MIMEVersion() 2157 if not value: 2158 mime_version.defects.append(errors.HeaderMissingRequiredValue( 2159 "Missing MIME version number (eg: 1.0)")) 2160 return mime_version 2161 if value[0] in CFWS_LEADER: 2162 token, value = get_cfws(value) 2163 mime_version.append(token) 2164 if not value: 2165 mime_version.defects.append(errors.HeaderMissingRequiredValue( 2166 "Expected MIME version number but found only CFWS")) 2167 digits = '' 2168 while value and value[0] != '.' and value[0] not in CFWS_LEADER: 2169 digits += value[0] 2170 value = value[1:] 2171 if not digits.isdigit(): 2172 mime_version.defects.append(errors.InvalidHeaderDefect( 2173 "Expected MIME major version number but found {!r}".format(digits))) 2174 mime_version.append(ValueTerminal(digits, 'xtext')) 2175 else: 2176 mime_version.major = int(digits) 2177 mime_version.append(ValueTerminal(digits, 'digits')) 2178 if value and value[0] in CFWS_LEADER: 2179 token, value = get_cfws(value) 2180 mime_version.append(token) 2181 if not value or value[0] != '.': 2182 if mime_version.major is not None: 2183 mime_version.defects.append(errors.InvalidHeaderDefect( 2184 "Incomplete MIME version; found only major number")) 2185 if value: 2186 mime_version.append(ValueTerminal(value, 'xtext')) 2187 return mime_version 2188 mime_version.append(ValueTerminal('.', 'version-separator')) 2189 value = value[1:] 2190 if value and value[0] in CFWS_LEADER: 2191 token, value = get_cfws(value) 2192 mime_version.append(token) 2193 if not value: 2194 if mime_version.major is not None: 2195 mime_version.defects.append(errors.InvalidHeaderDefect( 2196 "Incomplete MIME version; found only major number")) 2197 return mime_version 2198 digits = '' 2199 while value and value[0] not in CFWS_LEADER: 2200 digits += value[0] 2201 value = value[1:] 2202 if not digits.isdigit(): 2203 mime_version.defects.append(errors.InvalidHeaderDefect( 2204 "Expected MIME minor version number but found {!r}".format(digits))) 2205 mime_version.append(ValueTerminal(digits, 'xtext')) 2206 else: 2207 mime_version.minor = int(digits) 2208 mime_version.append(ValueTerminal(digits, 'digits')) 2209 if value and value[0] in CFWS_LEADER: 2210 token, value = get_cfws(value) 2211 mime_version.append(token) 2212 if value: 2213 mime_version.defects.append(errors.InvalidHeaderDefect( 2214 "Excess non-CFWS text after MIME version")) 2215 mime_version.append(ValueTerminal(value, 'xtext')) 2216 return mime_version 2217 2218def get_invalid_parameter(value): 2219 """ Read everything up to the next ';'. 2220 2221 This is outside the formal grammar. The InvalidParameter TokenList that is 2222 returned acts like a Parameter, but the data attributes are None. 2223 2224 """ 2225 invalid_parameter = InvalidParameter() 2226 while value and value[0] != ';': 2227 if value[0] in PHRASE_ENDS: 2228 invalid_parameter.append(ValueTerminal(value[0], 2229 'misplaced-special')) 2230 value = value[1:] 2231 else: 2232 token, value = get_phrase(value) 2233 invalid_parameter.append(token) 2234 return invalid_parameter, value 2235 2236def get_ttext(value): 2237 """ttext = <matches _ttext_matcher> 2238 2239 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's 2240 defects list if we find non-ttext characters. We also register defects for 2241 *any* non-printables even though the RFC doesn't exclude all of them, 2242 because we follow the spirit of RFC 5322. 2243 2244 """ 2245 m = _non_token_end_matcher(value) 2246 if not m: 2247 raise errors.HeaderParseError( 2248 "expected ttext but found '{}'".format(value)) 2249 ttext = m.group() 2250 value = value[len(ttext):] 2251 ttext = ValueTerminal(ttext, 'ttext') 2252 _validate_xtext(ttext) 2253 return ttext, value 2254 2255def get_token(value): 2256 """token = [CFWS] 1*ttext [CFWS] 2257 2258 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or 2259 tspecials. We also exclude tabs even though the RFC doesn't. 2260 2261 The RFC implies the CFWS but is not explicit about it in the BNF. 2262 2263 """ 2264 mtoken = Token() 2265 if value and value[0] in CFWS_LEADER: 2266 token, value = get_cfws(value) 2267 mtoken.append(token) 2268 if value and value[0] in TOKEN_ENDS: 2269 raise errors.HeaderParseError( 2270 "expected token but found '{}'".format(value)) 2271 token, value = get_ttext(value) 2272 mtoken.append(token) 2273 if value and value[0] in CFWS_LEADER: 2274 token, value = get_cfws(value) 2275 mtoken.append(token) 2276 return mtoken, value 2277 2278def get_attrtext(value): 2279 """attrtext = 1*(any non-ATTRIBUTE_ENDS character) 2280 2281 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the 2282 token's defects list if we find non-attrtext characters. We also register 2283 defects for *any* non-printables even though the RFC doesn't exclude all of 2284 them, because we follow the spirit of RFC 5322. 2285 2286 """ 2287 m = _non_attribute_end_matcher(value) 2288 if not m: 2289 raise errors.HeaderParseError( 2290 "expected attrtext but found {!r}".format(value)) 2291 attrtext = m.group() 2292 value = value[len(attrtext):] 2293 attrtext = ValueTerminal(attrtext, 'attrtext') 2294 _validate_xtext(attrtext) 2295 return attrtext, value 2296 2297def get_attribute(value): 2298 """ [CFWS] 1*attrtext [CFWS] 2299 2300 This version of the BNF makes the CFWS explicit, and as usual we use a 2301 value terminal for the actual run of characters. The RFC equivalent of 2302 attrtext is the token characters, with the subtraction of '*', "'", and '%'. 2303 We include tab in the excluded set just as we do for token. 2304 2305 """ 2306 attribute = Attribute() 2307 if value and value[0] in CFWS_LEADER: 2308 token, value = get_cfws(value) 2309 attribute.append(token) 2310 if value and value[0] in ATTRIBUTE_ENDS: 2311 raise errors.HeaderParseError( 2312 "expected token but found '{}'".format(value)) 2313 token, value = get_attrtext(value) 2314 attribute.append(token) 2315 if value and value[0] in CFWS_LEADER: 2316 token, value = get_cfws(value) 2317 attribute.append(token) 2318 return attribute, value 2319 2320def get_extended_attrtext(value): 2321 """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%') 2322 2323 This is a special parsing routine so that we get a value that 2324 includes % escapes as a single string (which we decode as a single 2325 string later). 2326 2327 """ 2328 m = _non_extended_attribute_end_matcher(value) 2329 if not m: 2330 raise errors.HeaderParseError( 2331 "expected extended attrtext but found {!r}".format(value)) 2332 attrtext = m.group() 2333 value = value[len(attrtext):] 2334 attrtext = ValueTerminal(attrtext, 'extended-attrtext') 2335 _validate_xtext(attrtext) 2336 return attrtext, value 2337 2338def get_extended_attribute(value): 2339 """ [CFWS] 1*extended_attrtext [CFWS] 2340 2341 This is like the non-extended version except we allow % characters, so that 2342 we can pick up an encoded value as a single string. 2343 2344 """ 2345 # XXX: should we have an ExtendedAttribute TokenList? 2346 attribute = Attribute() 2347 if value and value[0] in CFWS_LEADER: 2348 token, value = get_cfws(value) 2349 attribute.append(token) 2350 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS: 2351 raise errors.HeaderParseError( 2352 "expected token but found '{}'".format(value)) 2353 token, value = get_extended_attrtext(value) 2354 attribute.append(token) 2355 if value and value[0] in CFWS_LEADER: 2356 token, value = get_cfws(value) 2357 attribute.append(token) 2358 return attribute, value 2359 2360def get_section(value): 2361 """ '*' digits 2362 2363 The formal BNF is more complicated because leading 0s are not allowed. We 2364 check for that and add a defect. We also assume no CFWS is allowed between 2365 the '*' and the digits, though the RFC is not crystal clear on that. 2366 The caller should already have dealt with leading CFWS. 2367 2368 """ 2369 section = Section() 2370 if not value or value[0] != '*': 2371 raise errors.HeaderParseError("Expected section but found {}".format( 2372 value)) 2373 section.append(ValueTerminal('*', 'section-marker')) 2374 value = value[1:] 2375 if not value or not value[0].isdigit(): 2376 raise errors.HeaderParseError("Expected section number but " 2377 "found {}".format(value)) 2378 digits = '' 2379 while value and value[0].isdigit(): 2380 digits += value[0] 2381 value = value[1:] 2382 if digits[0] == '0' and digits != '0': 2383 section.defects.append(errors.InvalidHeaderDefect( 2384 "section number has an invalid leading 0")) 2385 section.number = int(digits) 2386 section.append(ValueTerminal(digits, 'digits')) 2387 return section, value 2388 2389 2390def get_value(value): 2391 """ quoted-string / attribute 2392 2393 """ 2394 v = Value() 2395 if not value: 2396 raise errors.HeaderParseError("Expected value but found end of string") 2397 leader = None 2398 if value[0] in CFWS_LEADER: 2399 leader, value = get_cfws(value) 2400 if not value: 2401 raise errors.HeaderParseError("Expected value but found " 2402 "only {}".format(leader)) 2403 if value[0] == '"': 2404 token, value = get_quoted_string(value) 2405 else: 2406 token, value = get_extended_attribute(value) 2407 if leader is not None: 2408 token[:0] = [leader] 2409 v.append(token) 2410 return v, value 2411 2412def get_parameter(value): 2413 """ attribute [section] ["*"] [CFWS] "=" value 2414 2415 The CFWS is implied by the RFC but not made explicit in the BNF. This 2416 simplified form of the BNF from the RFC is made to conform with the RFC BNF 2417 through some extra checks. We do it this way because it makes both error 2418 recovery and working with the resulting parse tree easier. 2419 """ 2420 # It is possible CFWS would also be implicitly allowed between the section 2421 # and the 'extended-attribute' marker (the '*') , but we've never seen that 2422 # in the wild and we will therefore ignore the possibility. 2423 param = Parameter() 2424 token, value = get_attribute(value) 2425 param.append(token) 2426 if not value or value[0] == ';': 2427 param.defects.append(errors.InvalidHeaderDefect("Parameter contains " 2428 "name ({}) but no value".format(token))) 2429 return param, value 2430 if value[0] == '*': 2431 try: 2432 token, value = get_section(value) 2433 param.sectioned = True 2434 param.append(token) 2435 except errors.HeaderParseError: 2436 pass 2437 if not value: 2438 raise errors.HeaderParseError("Incomplete parameter") 2439 if value[0] == '*': 2440 param.append(ValueTerminal('*', 'extended-parameter-marker')) 2441 value = value[1:] 2442 param.extended = True 2443 if value[0] != '=': 2444 raise errors.HeaderParseError("Parameter not followed by '='") 2445 param.append(ValueTerminal('=', 'parameter-separator')) 2446 value = value[1:] 2447 leader = None 2448 if value and value[0] in CFWS_LEADER: 2449 token, value = get_cfws(value) 2450 param.append(token) 2451 remainder = None 2452 appendto = param 2453 if param.extended and value and value[0] == '"': 2454 # Now for some serious hackery to handle the common invalid case of 2455 # double quotes around an extended value. We also accept (with defect) 2456 # a value marked as encoded that isn't really. 2457 qstring, remainder = get_quoted_string(value) 2458 inner_value = qstring.stripped_value 2459 semi_valid = False 2460 if param.section_number == 0: 2461 if inner_value and inner_value[0] == "'": 2462 semi_valid = True 2463 else: 2464 token, rest = get_attrtext(inner_value) 2465 if rest and rest[0] == "'": 2466 semi_valid = True 2467 else: 2468 try: 2469 token, rest = get_extended_attrtext(inner_value) 2470 except: 2471 pass 2472 else: 2473 if not rest: 2474 semi_valid = True 2475 if semi_valid: 2476 param.defects.append(errors.InvalidHeaderDefect( 2477 "Quoted string value for extended parameter is invalid")) 2478 param.append(qstring) 2479 for t in qstring: 2480 if t.token_type == 'bare-quoted-string': 2481 t[:] = [] 2482 appendto = t 2483 break 2484 value = inner_value 2485 else: 2486 remainder = None 2487 param.defects.append(errors.InvalidHeaderDefect( 2488 "Parameter marked as extended but appears to have a " 2489 "quoted string value that is non-encoded")) 2490 if value and value[0] == "'": 2491 token = None 2492 else: 2493 token, value = get_value(value) 2494 if not param.extended or param.section_number > 0: 2495 if not value or value[0] != "'": 2496 appendto.append(token) 2497 if remainder is not None: 2498 assert not value, value 2499 value = remainder 2500 return param, value 2501 param.defects.append(errors.InvalidHeaderDefect( 2502 "Apparent initial-extended-value but attribute " 2503 "was not marked as extended or was not initial section")) 2504 if not value: 2505 # Assume the charset/lang is missing and the token is the value. 2506 param.defects.append(errors.InvalidHeaderDefect( 2507 "Missing required charset/lang delimiters")) 2508 appendto.append(token) 2509 if remainder is None: 2510 return param, value 2511 else: 2512 if token is not None: 2513 for t in token: 2514 if t.token_type == 'extended-attrtext': 2515 break 2516 t.token_type == 'attrtext' 2517 appendto.append(t) 2518 param.charset = t.value 2519 if value[0] != "'": 2520 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " 2521 "delimiter, but found {!r}".format(value)) 2522 appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) 2523 value = value[1:] 2524 if value and value[0] != "'": 2525 token, value = get_attrtext(value) 2526 appendto.append(token) 2527 param.lang = token.value 2528 if not value or value[0] != "'": 2529 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " 2530 "delimiter, but found {}".format(value)) 2531 appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) 2532 value = value[1:] 2533 if remainder is not None: 2534 # Treat the rest of value as bare quoted string content. 2535 v = Value() 2536 while value: 2537 if value[0] in WSP: 2538 token, value = get_fws(value) 2539 elif value[0] == '"': 2540 token = ValueTerminal('"', 'DQUOTE') 2541 value = value[1:] 2542 else: 2543 token, value = get_qcontent(value) 2544 v.append(token) 2545 token = v 2546 else: 2547 token, value = get_value(value) 2548 appendto.append(token) 2549 if remainder is not None: 2550 assert not value, value 2551 value = remainder 2552 return param, value 2553 2554def parse_mime_parameters(value): 2555 """ parameter *( ";" parameter ) 2556 2557 That BNF is meant to indicate this routine should only be called after 2558 finding and handling the leading ';'. There is no corresponding rule in 2559 the formal RFC grammar, but it is more convenient for us for the set of 2560 parameters to be treated as its own TokenList. 2561 2562 This is 'parse' routine because it consumes the remaining value, but it 2563 would never be called to parse a full header. Instead it is called to 2564 parse everything after the non-parameter value of a specific MIME header. 2565 2566 """ 2567 mime_parameters = MimeParameters() 2568 while value: 2569 try: 2570 token, value = get_parameter(value) 2571 mime_parameters.append(token) 2572 except errors.HeaderParseError as err: 2573 leader = None 2574 if value[0] in CFWS_LEADER: 2575 leader, value = get_cfws(value) 2576 if not value: 2577 mime_parameters.append(leader) 2578 return mime_parameters 2579 if value[0] == ';': 2580 if leader is not None: 2581 mime_parameters.append(leader) 2582 mime_parameters.defects.append(errors.InvalidHeaderDefect( 2583 "parameter entry with no content")) 2584 else: 2585 token, value = get_invalid_parameter(value) 2586 if leader: 2587 token[:0] = [leader] 2588 mime_parameters.append(token) 2589 mime_parameters.defects.append(errors.InvalidHeaderDefect( 2590 "invalid parameter {!r}".format(token))) 2591 if value and value[0] != ';': 2592 # Junk after the otherwise valid parameter. Mark it as 2593 # invalid, but it will have a value. 2594 param = mime_parameters[-1] 2595 param.token_type = 'invalid-parameter' 2596 token, value = get_invalid_parameter(value) 2597 param.extend(token) 2598 mime_parameters.defects.append(errors.InvalidHeaderDefect( 2599 "parameter with invalid trailing text {!r}".format(token))) 2600 if value: 2601 # Must be a ';' at this point. 2602 mime_parameters.append(ValueTerminal(';', 'parameter-separator')) 2603 value = value[1:] 2604 return mime_parameters 2605 2606def _find_mime_parameters(tokenlist, value): 2607 """Do our best to find the parameters in an invalid MIME header 2608 2609 """ 2610 while value and value[0] != ';': 2611 if value[0] in PHRASE_ENDS: 2612 tokenlist.append(ValueTerminal(value[0], 'misplaced-special')) 2613 value = value[1:] 2614 else: 2615 token, value = get_phrase(value) 2616 tokenlist.append(token) 2617 if not value: 2618 return 2619 tokenlist.append(ValueTerminal(';', 'parameter-separator')) 2620 tokenlist.append(parse_mime_parameters(value[1:])) 2621 2622def parse_content_type_header(value): 2623 """ maintype "/" subtype *( ";" parameter ) 2624 2625 The maintype and substype are tokens. Theoretically they could 2626 be checked against the official IANA list + x-token, but we 2627 don't do that. 2628 """ 2629 ctype = ContentType() 2630 recover = False 2631 if not value: 2632 ctype.defects.append(errors.HeaderMissingRequiredValue( 2633 "Missing content type specification")) 2634 return ctype 2635 try: 2636 token, value = get_token(value) 2637 except errors.HeaderParseError: 2638 ctype.defects.append(errors.InvalidHeaderDefect( 2639 "Expected content maintype but found {!r}".format(value))) 2640 _find_mime_parameters(ctype, value) 2641 return ctype 2642 ctype.append(token) 2643 # XXX: If we really want to follow the formal grammar we should make 2644 # mantype and subtype specialized TokenLists here. Probably not worth it. 2645 if not value or value[0] != '/': 2646 ctype.defects.append(errors.InvalidHeaderDefect( 2647 "Invalid content type")) 2648 if value: 2649 _find_mime_parameters(ctype, value) 2650 return ctype 2651 ctype.maintype = token.value.strip().lower() 2652 ctype.append(ValueTerminal('/', 'content-type-separator')) 2653 value = value[1:] 2654 try: 2655 token, value = get_token(value) 2656 except errors.HeaderParseError: 2657 ctype.defects.append(errors.InvalidHeaderDefect( 2658 "Expected content subtype but found {!r}".format(value))) 2659 _find_mime_parameters(ctype, value) 2660 return ctype 2661 ctype.append(token) 2662 ctype.subtype = token.value.strip().lower() 2663 if not value: 2664 return ctype 2665 if value[0] != ';': 2666 ctype.defects.append(errors.InvalidHeaderDefect( 2667 "Only parameters are valid after content type, but " 2668 "found {!r}".format(value))) 2669 # The RFC requires that a syntactically invalid content-type be treated 2670 # as text/plain. Perhaps we should postel this, but we should probably 2671 # only do that if we were checking the subtype value against IANA. 2672 del ctype.maintype, ctype.subtype 2673 _find_mime_parameters(ctype, value) 2674 return ctype 2675 ctype.append(ValueTerminal(';', 'parameter-separator')) 2676 ctype.append(parse_mime_parameters(value[1:])) 2677 return ctype 2678 2679def parse_content_disposition_header(value): 2680 """ disposition-type *( ";" parameter ) 2681 2682 """ 2683 disp_header = ContentDisposition() 2684 if not value: 2685 disp_header.defects.append(errors.HeaderMissingRequiredValue( 2686 "Missing content disposition")) 2687 return disp_header 2688 try: 2689 token, value = get_token(value) 2690 except errors.HeaderParseError: 2691 disp_header.defects.append(errors.InvalidHeaderDefect( 2692 "Expected content disposition but found {!r}".format(value))) 2693 _find_mime_parameters(disp_header, value) 2694 return disp_header 2695 disp_header.append(token) 2696 disp_header.content_disposition = token.value.strip().lower() 2697 if not value: 2698 return disp_header 2699 if value[0] != ';': 2700 disp_header.defects.append(errors.InvalidHeaderDefect( 2701 "Only parameters are valid after content disposition, but " 2702 "found {!r}".format(value))) 2703 _find_mime_parameters(disp_header, value) 2704 return disp_header 2705 disp_header.append(ValueTerminal(';', 'parameter-separator')) 2706 disp_header.append(parse_mime_parameters(value[1:])) 2707 return disp_header 2708 2709def parse_content_transfer_encoding_header(value): 2710 """ mechanism 2711 2712 """ 2713 # We should probably validate the values, since the list is fixed. 2714 cte_header = ContentTransferEncoding() 2715 if not value: 2716 cte_header.defects.append(errors.HeaderMissingRequiredValue( 2717 "Missing content transfer encoding")) 2718 return cte_header 2719 try: 2720 token, value = get_token(value) 2721 except errors.HeaderParseError: 2722 cte_header.defects.append(errors.InvalidHeaderDefect( 2723 "Expected content transfer encoding but found {!r}".format(value))) 2724 else: 2725 cte_header.append(token) 2726 cte_header.cte = token.value.strip().lower() 2727 if not value: 2728 return cte_header 2729 while value: 2730 cte_header.defects.append(errors.InvalidHeaderDefect( 2731 "Extra text after content transfer encoding")) 2732 if value[0] in PHRASE_ENDS: 2733 cte_header.append(ValueTerminal(value[0], 'misplaced-special')) 2734 value = value[1:] 2735 else: 2736 token, value = get_phrase(value) 2737 cte_header.append(token) 2738 return cte_header 2739 2740 2741# 2742# Header folding 2743# 2744# Header folding is complex, with lots of rules and corner cases. The 2745# following code does its best to obey the rules and handle the corner 2746# cases, but you can be sure there are few bugs:) 2747# 2748# This folder generally canonicalizes as it goes, preferring the stringified 2749# version of each token. The tokens contain information that supports the 2750# folder, including which tokens can be encoded in which ways. 2751# 2752# Folded text is accumulated in a simple list of strings ('lines'), each 2753# one of which should be less than policy.max_line_length ('maxlen'). 2754# 2755 2756def _steal_trailing_WSP_if_exists(lines): 2757 wsp = '' 2758 if lines and lines[-1] and lines[-1][-1] in WSP: 2759 wsp = lines[-1][-1] 2760 lines[-1] = lines[-1][:-1] 2761 return wsp 2762 2763def _refold_parse_tree(parse_tree, *, policy): 2764 """Return string of contents of parse_tree folded according to RFC rules. 2765 2766 """ 2767 # max_line_length 0/None means no limit, ie: infinitely long. 2768 maxlen = policy.max_line_length or sys.maxsize 2769 encoding = 'utf-8' if policy.utf8 else 'us-ascii' 2770 lines = [''] 2771 last_ew = None 2772 wrap_as_ew_blocked = 0 2773 want_encoding = False 2774 end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') 2775 parts = list(parse_tree) 2776 while parts: 2777 part = parts.pop(0) 2778 if part is end_ew_not_allowed: 2779 wrap_as_ew_blocked -= 1 2780 continue 2781 tstr = str(part) 2782 if part.token_type == 'ptext' and set(tstr) & SPECIALS: 2783 # Encode if tstr contains special characters. 2784 want_encoding = True 2785 try: 2786 tstr.encode(encoding) 2787 charset = encoding 2788 except UnicodeEncodeError: 2789 if any(isinstance(x, errors.UndecodableBytesDefect) 2790 for x in part.all_defects): 2791 charset = 'unknown-8bit' 2792 else: 2793 # If policy.utf8 is false this should really be taken from a 2794 # 'charset' property on the policy. 2795 charset = 'utf-8' 2796 want_encoding = True 2797 if part.token_type == 'mime-parameters': 2798 # Mime parameter folding (using RFC2231) is extra special. 2799 _fold_mime_parameters(part, lines, maxlen, encoding) 2800 continue 2801 if want_encoding and not wrap_as_ew_blocked: 2802 if not part.as_ew_allowed: 2803 want_encoding = False 2804 last_ew = None 2805 if part.syntactic_break: 2806 encoded_part = part.fold(policy=policy)[:-len(policy.linesep)] 2807 if policy.linesep not in encoded_part: 2808 # It fits on a single line 2809 if len(encoded_part) > maxlen - len(lines[-1]): 2810 # But not on this one, so start a new one. 2811 newline = _steal_trailing_WSP_if_exists(lines) 2812 # XXX what if encoded_part has no leading FWS? 2813 lines.append(newline) 2814 lines[-1] += encoded_part 2815 continue 2816 # Either this is not a major syntactic break, so we don't 2817 # want it on a line by itself even if it fits, or it 2818 # doesn't fit on a line by itself. Either way, fall through 2819 # to unpacking the subparts and wrapping them. 2820 if not hasattr(part, 'encode'): 2821 # It's not a Terminal, do each piece individually. 2822 parts = list(part) + parts 2823 else: 2824 # It's a terminal, wrap it as an encoded word, possibly 2825 # combining it with previously encoded words if allowed. 2826 last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, 2827 part.ew_combine_allowed, charset) 2828 want_encoding = False 2829 continue 2830 if len(tstr) <= maxlen - len(lines[-1]): 2831 lines[-1] += tstr 2832 continue 2833 # This part is too long to fit. The RFC wants us to break at 2834 # "major syntactic breaks", so unless we don't consider this 2835 # to be one, check if it will fit on the next line by itself. 2836 if (part.syntactic_break and 2837 len(tstr) + 1 <= maxlen): 2838 newline = _steal_trailing_WSP_if_exists(lines) 2839 if newline or part.startswith_fws(): 2840 lines.append(newline + tstr) 2841 last_ew = None 2842 continue 2843 if not hasattr(part, 'encode'): 2844 # It's not a terminal, try folding the subparts. 2845 newparts = list(part) 2846 if not part.as_ew_allowed: 2847 wrap_as_ew_blocked += 1 2848 newparts.append(end_ew_not_allowed) 2849 parts = newparts + parts 2850 continue 2851 if part.as_ew_allowed and not wrap_as_ew_blocked: 2852 # It doesn't need CTE encoding, but encode it anyway so we can 2853 # wrap it. 2854 parts.insert(0, part) 2855 want_encoding = True 2856 continue 2857 # We can't figure out how to wrap, it, so give up. 2858 newline = _steal_trailing_WSP_if_exists(lines) 2859 if newline or part.startswith_fws(): 2860 lines.append(newline + tstr) 2861 else: 2862 # We can't fold it onto the next line either... 2863 lines[-1] += tstr 2864 return policy.linesep.join(lines) + policy.linesep 2865 2866def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): 2867 """Fold string to_encode into lines as encoded word, combining if allowed. 2868 Return the new value for last_ew, or None if ew_combine_allowed is False. 2869 2870 If there is already an encoded word in the last line of lines (indicated by 2871 a non-None value for last_ew) and ew_combine_allowed is true, decode the 2872 existing ew, combine it with to_encode, and re-encode. Otherwise, encode 2873 to_encode. In either case, split to_encode as necessary so that the 2874 encoded segments fit within maxlen. 2875 2876 """ 2877 if last_ew is not None and ew_combine_allowed: 2878 to_encode = str( 2879 get_unstructured(lines[-1][last_ew:] + to_encode)) 2880 lines[-1] = lines[-1][:last_ew] 2881 if to_encode[0] in WSP: 2882 # We're joining this to non-encoded text, so don't encode 2883 # the leading blank. 2884 leading_wsp = to_encode[0] 2885 to_encode = to_encode[1:] 2886 if (len(lines[-1]) == maxlen): 2887 lines.append(_steal_trailing_WSP_if_exists(lines)) 2888 lines[-1] += leading_wsp 2889 trailing_wsp = '' 2890 if to_encode[-1] in WSP: 2891 # Likewise for the trailing space. 2892 trailing_wsp = to_encode[-1] 2893 to_encode = to_encode[:-1] 2894 new_last_ew = len(lines[-1]) if last_ew is None else last_ew 2895 2896 encode_as = 'utf-8' if charset == 'us-ascii' else charset 2897 2898 # The RFC2047 chrome takes up 7 characters plus the length 2899 # of the charset name. 2900 chrome_len = len(encode_as) + 7 2901 2902 if (chrome_len + 1) >= maxlen: 2903 raise errors.HeaderParseError( 2904 "max_line_length is too small to fit an encoded word") 2905 2906 while to_encode: 2907 remaining_space = maxlen - len(lines[-1]) 2908 text_space = remaining_space - chrome_len 2909 if text_space <= 0: 2910 lines.append(' ') 2911 continue 2912 2913 to_encode_word = to_encode[:text_space] 2914 encoded_word = _ew.encode(to_encode_word, charset=encode_as) 2915 excess = len(encoded_word) - remaining_space 2916 while excess > 0: 2917 # Since the chunk to encode is guaranteed to fit into less than 100 characters, 2918 # shrinking it by one at a time shouldn't take long. 2919 to_encode_word = to_encode_word[:-1] 2920 encoded_word = _ew.encode(to_encode_word, charset=encode_as) 2921 excess = len(encoded_word) - remaining_space 2922 lines[-1] += encoded_word 2923 to_encode = to_encode[len(to_encode_word):] 2924 2925 if to_encode: 2926 lines.append(' ') 2927 new_last_ew = len(lines[-1]) 2928 lines[-1] += trailing_wsp 2929 return new_last_ew if ew_combine_allowed else None 2930 2931def _fold_mime_parameters(part, lines, maxlen, encoding): 2932 """Fold TokenList 'part' into the 'lines' list as mime parameters. 2933 2934 Using the decoded list of parameters and values, format them according to 2935 the RFC rules, including using RFC2231 encoding if the value cannot be 2936 expressed in 'encoding' and/or the parameter+value is too long to fit 2937 within 'maxlen'. 2938 2939 """ 2940 # Special case for RFC2231 encoding: start from decoded values and use 2941 # RFC2231 encoding iff needed. 2942 # 2943 # Note that the 1 and 2s being added to the length calculations are 2944 # accounting for the possibly-needed spaces and semicolons we'll be adding. 2945 # 2946 for name, value in part.params: 2947 # XXX What if this ';' puts us over maxlen the first time through the 2948 # loop? We should split the header value onto a newline in that case, 2949 # but to do that we need to recognize the need earlier or reparse the 2950 # header, so I'm going to ignore that bug for now. It'll only put us 2951 # one character over. 2952 if not lines[-1].rstrip().endswith(';'): 2953 lines[-1] += ';' 2954 charset = encoding 2955 error_handler = 'strict' 2956 try: 2957 value.encode(encoding) 2958 encoding_required = False 2959 except UnicodeEncodeError: 2960 encoding_required = True 2961 if utils._has_surrogates(value): 2962 charset = 'unknown-8bit' 2963 error_handler = 'surrogateescape' 2964 else: 2965 charset = 'utf-8' 2966 if encoding_required: 2967 encoded_value = urllib.parse.quote( 2968 value, safe='', errors=error_handler) 2969 tstr = "{}*={}''{}".format(name, charset, encoded_value) 2970 else: 2971 tstr = '{}={}'.format(name, quote_string(value)) 2972 if len(lines[-1]) + len(tstr) + 1 < maxlen: 2973 lines[-1] = lines[-1] + ' ' + tstr 2974 continue 2975 elif len(tstr) + 2 <= maxlen: 2976 lines.append(' ' + tstr) 2977 continue 2978 # We need multiple sections. We are allowed to mix encoded and 2979 # non-encoded sections, but we aren't going to. We'll encode them all. 2980 section = 0 2981 extra_chrome = charset + "''" 2982 while value: 2983 chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome) 2984 if maxlen <= chrome_len + 3: 2985 # We need room for the leading blank, the trailing semicolon, 2986 # and at least one character of the value. If we don't 2987 # have that, we'd be stuck, so in that case fall back to 2988 # the RFC standard width. 2989 maxlen = 78 2990 splitpoint = maxchars = maxlen - chrome_len - 2 2991 while True: 2992 partial = value[:splitpoint] 2993 encoded_value = urllib.parse.quote( 2994 partial, safe='', errors=error_handler) 2995 if len(encoded_value) <= maxchars: 2996 break 2997 splitpoint -= 1 2998 lines.append(" {}*{}*={}{}".format( 2999 name, section, extra_chrome, encoded_value)) 3000 extra_chrome = '' 3001 section += 1 3002 value = value[splitpoint:] 3003 if value: 3004 lines[-1] += ';' 3005