• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (C) 2004-2006 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line.  This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser.  It returns when there's nothing more it can do with the available
13data.  When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
16The other advantage of this parser is that it will never raise a parsing
17exception.  Instead, when it finds something unexpected, it adds a 'defect' to
18the current message.  Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
22__all__ = ['FeedParser', 'BytesFeedParser']
23
24import re
25
26from email import errors
27from email._policybase import compat32
28from collections import deque
29from io import StringIO
30
# Line-ending patterns.  Each one accepts CRLF, a bare CR, or a bare LF.
# NLCRE is unanchored; the parser applies it with .match() to test whether a
# line starts with a line ending.
NLCRE = re.compile(r'\r\n|\r|\n')
# Same alternatives, captured as a group; applied with .match() to strip a
# line ending from the beginning of a string.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# Anchored with \Z so that it only finds a line ending at the very end of a
# string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Same pattern as NLCRE_bol.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".
# A line counts as part of the header block when it starts with the unix-from
# marker 'From ', with a (possibly empty) run of ftext followed by a colon, or
# with a space or tab (a continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'
# Matches what may follow '--' + boundary on a boundary line: an optional
# closing '--' (group 'end'), optional trailing blanks (group 'ws') and an
# optional line ending (group 'linesep'), up to the end of the line.
boundaryendRE = re.compile(
    r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')

# Unique sentinel meaning "no complete line is buffered yet"; compared by
# identity (``is NeedMoreData``).
NeedMoreData = object()
44
45
class BufferedSubFile(object):
    """A line-oriented buffer that is filled incrementally.

    Text is handed over with push(); complete lines are retrieved with
    readline() or by iterating.  A stack of line predicates can be installed
    with push_eof_matcher(): as soon as any of them accepts the next line,
    readline() reports a false EOF (the empty string) and leaves that line in
    the buffer.  This lets the parser treat "the current part ended" exactly
    like "the input ended".
    """
    def __init__(self):
        # Holds the trailing, not yet terminated, piece of the pushed text.
        # See issue 22233 for why this is a text stream and not a list.
        self._partial = StringIO(newline='')
        # Complete lines waiting to be read, oldest first.
        self._lines = deque()
        # Predicates that signal a false EOF; the innermost one is last.
        self._eofstack = []
        # Set by close(); after that an empty buffer means real EOF.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Install pred as the innermost false-EOF predicate."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the innermost false-EOF predicate."""
        return self._eofstack.pop()

    def _take_partial_lines(self):
        # Empty the partial-text stream and return its content split into
        # lines (line endings are kept on the lines).
        stream = self._partial
        stream.seek(0)
        pieces = stream.readlines()
        stream.seek(0)
        stream.truncate()
        return pieces

    def close(self):
        """Flush any unterminated trailing text and mark the buffer closed."""
        self.pushlines(self._take_partial_lines())
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            return '' if self._closed else NeedMoreData
        candidate = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires outer level boundaries to be
        # recognised at any nesting depth, so every predicate is consulted,
        # going from the most deeply nested one outwards.
        if any(is_eof(candidate) for is_eof in reversed(self._eofstack)):
            # False EOF: the line stays available for whoever reads next.
            self._lines.appendleft(candidate)
            return ''
        return candidate

    def unreadline(self, line):
        """Put line back at the front of the buffer."""
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if not ('\n' in data or '\r' in data):
            # Nothing in this chunk can have completed a line.
            return

        # Split everything buffered so far into lines, keeping line endings.
        complete = self._take_partial_lines()

        # A last piece that does not end in '\n' goes back into the partial
        # buffer.  Only '\n' is tested on purpose: a piece ending in '\r' may
        # be the first half of a '\r\n' that was split between two chunks
        # (see bugs 1555570 and 1721862).
        if not complete[-1].endswith('\n'):
            self._partial.write(complete.pop())
        self.pushlines(complete)

    def pushlines(self, lines):
        """Append already complete lines to the end of the buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        item = self.readline()
        if item == '':
            raise StopIteration
        return item
134
135
136class FeedParser:
137    """A feed-style parser of email."""
138
139    def __init__(self, _factory=None, *, policy=compat32):
140        """_factory is called with no arguments to create a new message obj
141
142        The policy keyword specifies a policy object that controls a number of
143        aspects of the parser's operation.  The default policy maintains
144        backward compatibility.
145
146        """
147        self.policy = policy
148        self._old_style_factory = False
149        if _factory is None:
150            if policy.message_factory is None:
151                from email.message import Message
152                self._factory = Message
153            else:
154                self._factory = policy.message_factory
155        else:
156            self._factory = _factory
157            try:
158                _factory(policy=self.policy)
159            except TypeError:
160                # Assume this is an old-style factory
161                self._old_style_factory = True
162        self._input = BufferedSubFile()
163        self._msgstack = []
164        self._parse = self._parsegen().__next__
165        self._cur = None
166        self._last = None
167        self._headersonly = False
168
169    # Non-public interface for supporting Parser's headersonly flag
170    def _set_headersonly(self):
171        self._headersonly = True
172
173    def feed(self, data):
174        """Push more data into the parser."""
175        self._input.push(data)
176        self._call_parse()
177
178    def _call_parse(self):
179        try:
180            self._parse()
181        except StopIteration:
182            pass
183
184    def close(self):
185        """Parse all remaining data and return the root message object."""
186        self._input.close()
187        self._call_parse()
188        root = self._pop_message()
189        assert not self._msgstack
190        # Look for final set of defects
191        if root.get_content_maintype() == 'multipart' \
192               and not root.is_multipart() and not self._headersonly:
193            defect = errors.MultipartInvariantViolationDefect()
194            self.policy.handle_defect(root, defect)
195        return root
196
197    def _new_message(self):
198        if self._old_style_factory:
199            msg = self._factory()
200        else:
201            msg = self._factory(policy=self.policy)
202        if self._cur and self._cur.get_content_type() == 'multipart/digest':
203            msg.set_default_type('message/rfc822')
204        if self._msgstack:
205            self._msgstack[-1].attach(msg)
206        self._msgstack.append(msg)
207        self._cur = msg
208        self._last = msg
209
210    def _pop_message(self):
211        retval = self._msgstack.pop()
212        if self._msgstack:
213            self._cur = self._msgstack[-1]
214        else:
215            self._cur = None
216        return retval
217
218    def _parsegen(self):
219        # Create a new message and start by parsing headers.
220        self._new_message()
221        headers = []
222        # Collect the headers, searching for a line that doesn't match the RFC
223        # 2822 header or continuation pattern (including an empty line).
224        for line in self._input:
225            if line is NeedMoreData:
226                yield NeedMoreData
227                continue
228            if not headerRE.match(line):
229                # If we saw the RFC defined header/body separator
230                # (i.e. newline), just throw it away. Otherwise the line is
231                # part of the body so push it back.
232                if not NLCRE.match(line):
233                    defect = errors.MissingHeaderBodySeparatorDefect()
234                    self.policy.handle_defect(self._cur, defect)
235                    self._input.unreadline(line)
236                break
237            headers.append(line)
238        # Done with the headers, so parse them and figure out what we're
239        # supposed to see in the body of the message.
240        self._parse_headers(headers)
241        # Headers-only parsing is a backwards compatibility hack, which was
242        # necessary in the older parser, which could raise errors.  All
243        # remaining lines in the input are thrown into the message body.
244        if self._headersonly:
245            lines = []
246            while True:
247                line = self._input.readline()
248                if line is NeedMoreData:
249                    yield NeedMoreData
250                    continue
251                if line == '':
252                    break
253                lines.append(line)
254            self._cur.set_payload(EMPTYSTRING.join(lines))
255            return
256        if self._cur.get_content_type() == 'message/delivery-status':
257            # message/delivery-status contains blocks of headers separated by
258            # a blank line.  We'll represent each header block as a separate
259            # nested message object, but the processing is a bit different
260            # than standard message/* types because there is no body for the
261            # nested messages.  A blank line separates the subparts.
262            while True:
263                self._input.push_eof_matcher(NLCRE.match)
264                for retval in self._parsegen():
265                    if retval is NeedMoreData:
266                        yield NeedMoreData
267                        continue
268                    break
269                self._pop_message()
270                # We need to pop the EOF matcher in order to tell if we're at
271                # the end of the current file, not the end of the last block
272                # of message headers.
273                self._input.pop_eof_matcher()
274                # The input stream must be sitting at the newline or at the
275                # EOF.  We want to see if we're at the end of this subpart, so
276                # first consume the blank line, then test the next line to see
277                # if we're at this subpart's EOF.
278                while True:
279                    line = self._input.readline()
280                    if line is NeedMoreData:
281                        yield NeedMoreData
282                        continue
283                    break
284                while True:
285                    line = self._input.readline()
286                    if line is NeedMoreData:
287                        yield NeedMoreData
288                        continue
289                    break
290                if line == '':
291                    break
292                # Not at EOF so this is a line we're going to need.
293                self._input.unreadline(line)
294            return
295        if self._cur.get_content_maintype() == 'message':
296            # The message claims to be a message/* type, then what follows is
297            # another RFC 2822 message.
298            for retval in self._parsegen():
299                if retval is NeedMoreData:
300                    yield NeedMoreData
301                    continue
302                break
303            self._pop_message()
304            return
305        if self._cur.get_content_maintype() == 'multipart':
306            boundary = self._cur.get_boundary()
307            if boundary is None:
308                # The message /claims/ to be a multipart but it has not
309                # defined a boundary.  That's a problem which we'll handle by
310                # reading everything until the EOF and marking the message as
311                # defective.
312                defect = errors.NoBoundaryInMultipartDefect()
313                self.policy.handle_defect(self._cur, defect)
314                lines = []
315                for line in self._input:
316                    if line is NeedMoreData:
317                        yield NeedMoreData
318                        continue
319                    lines.append(line)
320                self._cur.set_payload(EMPTYSTRING.join(lines))
321                return
322            # Make sure a valid content type was specified per RFC 2045:6.4.
323            if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
324                    not in ('7bit', '8bit', 'binary')):
325                defect = errors.InvalidMultipartContentTransferEncodingDefect()
326                self.policy.handle_defect(self._cur, defect)
327            # Create a line match predicate which matches the inter-part
328            # boundary as well as the end-of-multipart boundary.  Don't push
329            # this onto the input stream until we've scanned past the
330            # preamble.
331            separator = '--' + boundary
332            def boundarymatch(line):
333                if not line.startswith(separator):
334                    return None
335                return boundaryendRE.match(line, len(separator))
336            capturing_preamble = True
337            preamble = []
338            linesep = False
339            close_boundary_seen = False
340            while True:
341                line = self._input.readline()
342                if line is NeedMoreData:
343                    yield NeedMoreData
344                    continue
345                if line == '':
346                    break
347                mo = boundarymatch(line)
348                if mo:
349                    # If we're looking at the end boundary, we're done with
350                    # this multipart.  If there was a newline at the end of
351                    # the closing boundary, then we need to initialize the
352                    # epilogue with the empty string (see below).
353                    if mo.group('end'):
354                        close_boundary_seen = True
355                        linesep = mo.group('linesep')
356                        break
357                    # We saw an inter-part boundary.  Were we in the preamble?
358                    if capturing_preamble:
359                        if preamble:
360                            # According to RFC 2046, the last newline belongs
361                            # to the boundary.
362                            lastline = preamble[-1]
363                            eolmo = NLCRE_eol.search(lastline)
364                            if eolmo:
365                                preamble[-1] = lastline[:-len(eolmo.group(0))]
366                            self._cur.preamble = EMPTYSTRING.join(preamble)
367                        capturing_preamble = False
368                        self._input.unreadline(line)
369                        continue
370                    # We saw a boundary separating two parts.  Consume any
371                    # multiple boundary lines that may be following.  Our
372                    # interpretation of RFC 2046 BNF grammar does not produce
373                    # body parts within such double boundaries.
374                    while True:
375                        line = self._input.readline()
376                        if line is NeedMoreData:
377                            yield NeedMoreData
378                            continue
379                        mo = boundarymatch(line)
380                        if not mo:
381                            self._input.unreadline(line)
382                            break
383                    # Recurse to parse this subpart; the input stream points
384                    # at the subpart's first line.
385                    self._input.push_eof_matcher(boundarymatch)
386                    for retval in self._parsegen():
387                        if retval is NeedMoreData:
388                            yield NeedMoreData
389                            continue
390                        break
391                    # Because of RFC 2046, the newline preceding the boundary
392                    # separator actually belongs to the boundary, not the
393                    # previous subpart's payload (or epilogue if the previous
394                    # part is a multipart).
395                    if self._last.get_content_maintype() == 'multipart':
396                        epilogue = self._last.epilogue
397                        if epilogue == '':
398                            self._last.epilogue = None
399                        elif epilogue is not None:
400                            mo = NLCRE_eol.search(epilogue)
401                            if mo:
402                                end = len(mo.group(0))
403                                self._last.epilogue = epilogue[:-end]
404                    else:
405                        payload = self._last._payload
406                        if isinstance(payload, str):
407                            mo = NLCRE_eol.search(payload)
408                            if mo:
409                                payload = payload[:-len(mo.group(0))]
410                                self._last._payload = payload
411                    self._input.pop_eof_matcher()
412                    self._pop_message()
413                    # Set the multipart up for newline cleansing, which will
414                    # happen if we're in a nested multipart.
415                    self._last = self._cur
416                else:
417                    # I think we must be in the preamble
418                    assert capturing_preamble
419                    preamble.append(line)
420            # We've seen either the EOF or the end boundary.  If we're still
421            # capturing the preamble, we never saw the start boundary.  Note
422            # that as a defect and store the captured text as the payload.
423            if capturing_preamble:
424                defect = errors.StartBoundaryNotFoundDefect()
425                self.policy.handle_defect(self._cur, defect)
426                self._cur.set_payload(EMPTYSTRING.join(preamble))
427                epilogue = []
428                for line in self._input:
429                    if line is NeedMoreData:
430                        yield NeedMoreData
431                        continue
432                self._cur.epilogue = EMPTYSTRING.join(epilogue)
433                return
434            # If we're not processing the preamble, then we might have seen
435            # EOF without seeing that end boundary...that is also a defect.
436            if not close_boundary_seen:
437                defect = errors.CloseBoundaryNotFoundDefect()
438                self.policy.handle_defect(self._cur, defect)
439                return
440            # Everything from here to the EOF is epilogue.  If the end boundary
441            # ended in a newline, we'll need to make sure the epilogue isn't
442            # None
443            if linesep:
444                epilogue = ['']
445            else:
446                epilogue = []
447            for line in self._input:
448                if line is NeedMoreData:
449                    yield NeedMoreData
450                    continue
451                epilogue.append(line)
452            # Any CRLF at the front of the epilogue is not technically part of
453            # the epilogue.  Also, watch out for an empty string epilogue,
454            # which means a single newline.
455            if epilogue:
456                firstline = epilogue[0]
457                bolmo = NLCRE_bol.match(firstline)
458                if bolmo:
459                    epilogue[0] = firstline[len(bolmo.group(0)):]
460            self._cur.epilogue = EMPTYSTRING.join(epilogue)
461            return
462        # Otherwise, it's some non-multipart type, so the entire rest of the
463        # file contents becomes the payload.
464        lines = []
465        for line in self._input:
466            if line is NeedMoreData:
467                yield NeedMoreData
468                continue
469            lines.append(line)
470        self._cur.set_payload(EMPTYSTRING.join(lines))
471
472    def _parse_headers(self, lines):
473        # Passed a list of lines that make up the headers for the current msg
474        lastheader = ''
475        lastvalue = []
476        for lineno, line in enumerate(lines):
477            # Check for continuation
478            if line[0] in ' \t':
479                if not lastheader:
480                    # The first line of the headers was a continuation.  This
481                    # is illegal, so let's note the defect, store the illegal
482                    # line, and ignore it for purposes of headers.
483                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
484                    self.policy.handle_defect(self._cur, defect)
485                    continue
486                lastvalue.append(line)
487                continue
488            if lastheader:
489                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
490                lastheader, lastvalue = '', []
491            # Check for envelope header, i.e. unix-from
492            if line.startswith('From '):
493                if lineno == 0:
494                    # Strip off the trailing newline
495                    mo = NLCRE_eol.search(line)
496                    if mo:
497                        line = line[:-len(mo.group(0))]
498                    self._cur.set_unixfrom(line)
499                    continue
500                elif lineno == len(lines) - 1:
501                    # Something looking like a unix-from at the end - it's
502                    # probably the first line of the body, so push back the
503                    # line and stop.
504                    self._input.unreadline(line)
505                    return
506                else:
507                    # Weirdly placed unix-from line.  Note this as a defect
508                    # and ignore it.
509                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
510                    self._cur.defects.append(defect)
511                    continue
512            # Split the line on the colon separating field name from value.
513            # There will always be a colon, because if there wasn't the part of
514            # the parser that calls us would have started parsing the body.
515            i = line.find(':')
516
517            # If the colon is on the start of the line the header is clearly
518            # malformed, but we might be able to salvage the rest of the
519            # message. Track the error but keep going.
520            if i == 0:
521                defect = errors.InvalidHeaderDefect("Missing header name.")
522                self._cur.defects.append(defect)
523                continue
524
525            assert i>0, "_parse_headers fed line with no : and no leading WS"
526            lastheader = line[:i]
527            lastvalue = [line]
528        # Done with all the lines, so handle the last header.
529        if lastheader:
530            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
531
532
class BytesFeedParser(FeedParser):
    """A FeedParser whose feed() method takes bytes instead of str."""

    def feed(self, data):
        """Decode data and push it into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler, so non-ASCII bytes are carried through as lone surrogates
        rather than raising an error.
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)
538