# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line. This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser. It returns when there's nothing more it can do with the available
data. When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it does not raise parsing
exceptions of its own. Instead, when it finds something unexpected, it hands
a 'defect' to the policy's handle_defect() method for the current message.
What happens to the defect (for example, being stored on the message
object's .defects attribute) is decided by the policy.
"""

__all__ = ['FeedParser', 'BytesFeedParser']

import re

from email import errors
from email._policybase import compat32
from collections import deque
from io import StringIO

NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'
boundaryendRE = re.compile(
    r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')

# Sentinel returned/yielded when the buffer has no complete line available
# yet but has not been closed either.
NeedMoreData = object()


class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack. When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead. This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # Text stream of the last partial line pushed into this object.
        # See issue 22233 for why this is a text stream and not a list.
        self._partial = StringIO(newline='')
        # A deque of full, pushed lines
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate (callable taking a line) on the stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Queue any trailing partial line and mark the file as closed."""
        # Don't forget any trailing partial line.
        self._partial.seek(0)
        self.pushlines(self._partial.readlines())
        self._partial.seek(0)
        self._partial.truncate()
        self._closed = True

    def readline(self):
        """Return the next queued line.

        Returns '' when the queue is empty and the file is closed, or when
        the next line matches any pushed false-EOF predicate (the line is
        then left in the queue). Returns NeedMoreData when the queue is
        empty but the file is still open.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting. Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF. But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Put *line* back at the front of the queue."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if '\n' not in data and '\r' not in data:
            # No new complete lines, wait for more.
            return

        # Crack into lines, preserving the linesep characters.
        self._partial.seek(0)
        parts = self._partial.readlines()
        self._partial.seek(0)
        self._partial.truncate()

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line. We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial.write(parts.pop())
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append already-split *lines* to the end of the queue."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line or NeedMoreData; stop at (false) EOF."""
        line = self.readline()
        if line == '':
            raise StopIteration
        return line


class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=None, *, policy=compat32):
        """Create a parser that builds messages with *_factory*.

        If _factory is omitted, policy.message_factory is used, falling back
        to email.message.Message when that is None. An explicitly supplied
        _factory is first tried with a ``policy`` keyword argument; if that
        raises TypeError it is treated as an old-style factory and is called
        with no arguments from then on.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation. The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        self._msgstack = []
        # Creating the generator does not run any of its code; parsing starts
        # on the first _call_parse().
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parse generator one step, ignoring its exhaustion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
                and not root.is_multipart() and not self._headersonly:
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer has no complete line
        available and has not been closed; resuming the generator retries the
        read. The generator finishes when the current message has been fully
        parsed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary

            def boundarymatch(line):
                if not line.startswith(separator):
                    return None
                return boundaryendRE.match(line, len(separator))

            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundarymatch(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundarymatch(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundarymatch)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE: the remaining input is drained here without being
                # stored, so the epilogue set below is always the empty
                # string on this path.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue. If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block *lines* on the current message.

        lines is the list of raw header lines (line endings included)
        collected by _parsegen(). Continuation lines are folded into the
        preceding header, a leading 'From ' line becomes the unix-from, and
        malformed lines are reported through self.policy.handle_defect() so
        that the policy decides whether they are recorded or raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it. Report it through the policy, like every
                    # other defect in this parser, so that raise_on_defect
                    # and custom register_defect hooks are honoured.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error (via the policy) but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))


class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode *data* as ASCII (surrogateescape) and feed it as text."""
        super().feed(data.decode('ascii', 'surrogateescape'))