• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Python Markdown
3
4A Python implementation of John Gruber's Markdown.
5
6Documentation: https://python-markdown.github.io/
7GitHub: https://github.com/Python-Markdown/markdown/
8PyPI: https://pypi.org/project/Markdown/
9
10Started by Manfred Stienstra (http://www.dwerg.net/).
11Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
12Currently maintained by Waylan Limberg (https://github.com/waylan),
13Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14
15Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
16Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
17Copyright 2004 Manfred Stienstra (the original version)
18
19License: BSD (see LICENSE.md for details).
20"""
21
22import re
23import importlib.util
24import sys
25
26
# Import a private copy of the `html.parser` lib as `htmlparser` so we can monkeypatch it.
# The copy is a separate module object, so users can still do `from html import parser`
# and get the default (unpatched) behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the copy in `sys.modules` under the top-level name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before
# `incomplete`, and the two regexes are now the same object, `incomplete` will simply never match
# and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
# The three places where a backtick was added to a character class are marked inside the pattern.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text: two consecutive newlines,
# each of which may be preceded by any number of space characters (spaces only).
blank_line_re = re.compile(r'^([ ]*\n){2}')
62
63
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the `htmlStash` of the Markdown instance passed
    to `md` and the remaining text is stored in `cleandoc` as a list of strings.
    """

    # Memo of the most recent successful `line_offset` lookup, stored as a
    # `(rawdata, lineno, index)` tuple. It is only reused while `self.rawdata`
    # is the very same string object, so it can never describe stale text.
    _line_start_memo = (None, 1, 0)

    def __init__(self, md, *args, **kwargs):
        """
        Create an extractor bound to the Markdown instance `md`.

        Remaining arguments are passed through to the base parser.
        `convert_charrefs` defaults to `False` unless the caller supplies it.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack = []  # When inraw==True, stack contains a list of tags
        self._cache = []
        self.cleandoc = []
        self._line_start_memo = (None, 1, 0)
        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self):
        """
        Returns char index in self.rawdata for the start of the current line.

        The index is found by walking newline positions with `str.find`. The
        walk resumes from the previously computed line start when `rawdata`
        is unchanged and the line number has not moved backwards, so repeated
        lookups do not rescan the text from the beginning.
        """
        rawdata = self.rawdata
        target = self.lineno
        if target <= 1 or '\n' not in rawdata:
            return 0

        memo_src, memo_line, memo_pos = self._line_start_memo
        if memo_src is rawdata and memo_line <= target:
            line, pos = memo_line, memo_pos
        else:
            line, pos = 1, 0

        while line < target:
            newline = rawdata.find('\n', pos)
            if newline == -1:  # pragma: no cover
                # Value of self.lineno must exceed total number of lines.
                # Fall back to the index of the last newline in the text.
                return rawdata.rfind('\n')
            pos = newline + 1
            line += 1

        self._line_start_memo = (rawdata, line, pos)
        return pos

    def at_line_start(self):
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return self.rawdata[line_start:line_start + self.offset].strip() == ''

    def get_endtag_text(self, tag):
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return '</{}>'.format(tag)

    def handle_starttag(self, tag, attrs):
        """
        Handle an opening tag.

        Tags listed in `empty_tags` are rerouted to `handle_startendtag`.
        A block-level tag found in the tail of a previous raw block, or at the
        start of a line while not already inside a raw block, opens a new raw
        block. While in a raw block the tag text is cached and the tag is
        pushed on the stack; otherwise the tag text goes to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag):
        """
        Handle a closing tag.

        Inside a raw block the tag text is cached and the stack is unwound down
        to (and including) the matching open tag, if there is one. When the
        stack becomes empty the cached block is stored in the `htmlStash` and
        its placeholder is appended to `cleandoc`. Outside a raw block the tag
        text is appended to `cleandoc` unchanged.
        """
        text = self.get_endtag_text(tag)

        if not self.inraw:
            self.cleandoc.append(text)
            return

        self._cache.append(text)
        if tag in self.stack:
            # Remove tag (and anything opened after it) from stack.
            # The membership test above guarantees the loop terminates.
            while self.stack.pop() != tag:
                pass
        if not self.stack:
            # End of raw block.
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                # Preserve blank line and end of raw block.
                self._cache.append('\n')
            else:
                # More content exists after endtag.
                self.intail = True
            # Reset stack.
            self.inraw = False
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
            self._cache = []

    def handle_data(self, data):
        """
        Handle text data.

        Data containing a newline ends the "tail" state. The data is cached
        while inside a raw block and appended to `cleandoc` otherwise.
        """
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data, is_block):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag using its original source text."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name):
        """Handle a numeric character reference; it is re-emitted as `&#name;`."""
        self.handle_empty_tag('&#{};'.format(name), is_block=False)

    def handle_entityref(self, name):
        """Handle a named entity reference; it is re-emitted as `&name;`."""
        self.handle_empty_tag('&{};'.format(name), is_block=False)

    def handle_comment(self, data):
        """Handle a comment; it is rebuilt as `<!--data-->` and treated as block level."""
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

    def handle_decl(self, data):
        """Handle a declaration; it is rebuilt as `<!data>` and treated as block level."""
        self.handle_empty_tag('<!{}>'.format(data), is_block=True)

    def handle_pi(self, data):
        """Handle a processing instruction; it is rebuilt as `<?data?>` and treated as block level."""
        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

    def unknown_decl(self, data):
        """Handle a marked section; `CDATA[` sections are closed with `]]>`, others with `]>`."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

    def parse_pi(self, i):
        """
        Parse a processing instruction starting at index `i`.

        Defers to the base class only at the start of a line or in the tail of
        a raw block. Otherwise `<?` is emitted as data and parsing resumes
        right after it. Returns the index at which parsing should continue.
        """
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i):
        """
        Parse an HTML declaration starting at index `i`.

        Defers to the base class only at the start of a line or in the tail of
        a raw block. Otherwise `<!` is emitted as data and parsing resumes
        right after it. Returns the index at which parsing should continue.
        """
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    # The rest has been copied from base class in standard lib to address #1036.
    # As __starttag_text is private, all references to it must be in this subclass.
    # The last few lines of parse_starttag are reversed so that handle_starttag
    # can override cdata_mode in certain situations (in a code span).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def parse_starttag(self, i):  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the tag, or the negative value reported by
        `check_for_whole_start_tag` when the whole tag is not available.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                # Strip the matching pair of quotes around the value.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Not a well terminated tag; pass the source text through as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos
324