• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Python Markdown
3
4A Python implementation of John Gruber's Markdown.
5
6Documentation: https://python-markdown.github.io/
7GitHub: https://github.com/Python-Markdown/markdown/
8PyPI: https://pypi.org/project/Markdown/
9
10Started by Manfred Stienstra (http://www.dwerg.net/).
11Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
12Currently maintained by Waylan Limberg (https://github.com/waylan),
13Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14
15Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
16Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
17Copyright 2004 Manfred Stienstra (the original version)
18
19License: BSD (see LICENSE.md for details).
20"""
21
22import re
23import xml.etree.ElementTree as etree
24from . import util
25from . import inlinepatterns
26
27
def build_treeprocessors(md, **kwargs):
    """
    Create the registry holding the default treeprocessors.

    Keyword arguments:

    * md: the Markdown instance handed to each treeprocessor's constructor.
    * kwargs: accepted for call compatibility; not used here.

    Returns: a `util.Registry` with the 'inline', 'prettify' and 'unescape'
    treeprocessors registered at priorities 20, 10 and 0 respectively.

    """
    registry = util.Registry()
    # (class, registered name, priority) -- instantiated in this order.
    defaults = (
        (InlineProcessor, 'inline', 20),
        (PrettifyTreeprocessor, 'prettify', 10),
        (UnescapeTreeprocessor, 'unescape', 0),
    )
    for processor_cls, name, priority in defaults:
        registry.register(processor_cls(md), name, priority)
    return registry
35
36
def isString(s):
    """
    Tell whether `s` is an ordinary string.

    Returns True only for `str` instances that are not `util.AtomicString`;
    an AtomicString (and any non-string object) yields False.
    """
    return isinstance(s, str) and not isinstance(s, util.AtomicString)
42
43
class Treeprocessor(util.Processor):
    """
    Base class for processors that work on the parsed ElementTree.

    Treeprocessors are run on the ElementTree object before serialization.
    A subclass provides a "run" method which receives the root element and
    either edits the tree in place or hands back a replacement tree.

    Treeprocessors must extend markdown.Treeprocessor.

    """
    def run(self, root):
        """
        Hook to be overridden by subclasses.

        Keyword arguments:

        * root: the root ElementTree element to process.

        Returns: a new ElementTree object to replace the existing root, or
        None when the current tree was modified in place. This base
        implementation does nothing and returns None.
        """
        return None  # pragma: no cover
63
64
65class InlineProcessor(Treeprocessor):
66    """
67    A Treeprocessor that traverses a tree, applying inline patterns.
68    """
69
70    def __init__(self, md):
71        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
72        self.__placeholder_suffix = util.ETX
73        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
74                                      + len(self.__placeholder_suffix)
75        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
76        self.md = md
77        self.inlinePatterns = md.inlinePatterns
78        self.ancestors = []
79
80    def __makePlaceholder(self, type):
81        """ Generate a placeholder """
82        id = "%04d" % len(self.stashed_nodes)
83        hash = util.INLINE_PLACEHOLDER % id
84        return hash, id
85
86    def __findPlaceholder(self, data, index):
87        """
88        Extract id from data string, start from index
89
90        Keyword arguments:
91
92        * data: string
93        * index: index, from which we start search
94
95        Returns: placeholder id and string index, after the found placeholder.
96
97        """
98        m = self.__placeholder_re.search(data, index)
99        if m:
100            return m.group(1), m.end()
101        else:
102            return None, index + 1
103
104    def __stashNode(self, node, type):
105        """ Add node to stash """
106        placeholder, id = self.__makePlaceholder(type)
107        self.stashed_nodes[id] = node
108        return placeholder
109
110    def __handleInline(self, data, patternIndex=0):
111        """
112        Process string with inline patterns and replace it
113        with placeholders
114
115        Keyword arguments:
116
117        * data: A line of Markdown text
118        * patternIndex: The index of the inlinePattern to start with
119
120        Returns: String with placeholders.
121
122        """
123        if not isinstance(data, util.AtomicString):
124            startIndex = 0
125            count = len(self.inlinePatterns)
126            while patternIndex < count:
127                data, matched, startIndex = self.__applyPattern(
128                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
129                )
130                if not matched:
131                    patternIndex += 1
132        return data
133
134    def __processElementText(self, node, subnode, isText=True):
135        """
136        Process placeholders in Element.text or Element.tail
137        of Elements popped from self.stashed_nodes.
138
139        Keywords arguments:
140
141        * node: parent node
142        * subnode: processing node
143        * isText: bool variable, True - it's text, False - it's tail
144
145        Returns: None
146
147        """
148        if isText:
149            text = subnode.text
150            subnode.text = None
151        else:
152            text = subnode.tail
153            subnode.tail = None
154
155        childResult = self.__processPlaceholders(text, subnode, isText)
156
157        if not isText and node is not subnode:
158            pos = list(node).index(subnode) + 1
159        else:
160            pos = 0
161
162        childResult.reverse()
163        for newChild in childResult:
164            node.insert(pos, newChild[0])
165
166    def __processPlaceholders(self, data, parent, isText=True):
167        """
168        Process string with placeholders and generate ElementTree tree.
169
170        Keyword arguments:
171
172        * data: string with placeholders instead of ElementTree elements.
173        * parent: Element, which contains processing inline data
174
175        Returns: list with ElementTree elements with applied inline patterns.
176
177        """
178        def linkText(text):
179            if text:
180                if result:
181                    if result[-1][0].tail:
182                        result[-1][0].tail += text
183                    else:
184                        result[-1][0].tail = text
185                elif not isText:
186                    if parent.tail:
187                        parent.tail += text
188                    else:
189                        parent.tail = text
190                else:
191                    if parent.text:
192                        parent.text += text
193                    else:
194                        parent.text = text
195        result = []
196        strartIndex = 0
197        while data:
198            index = data.find(self.__placeholder_prefix, strartIndex)
199            if index != -1:
200                id, phEndIndex = self.__findPlaceholder(data, index)
201
202                if id in self.stashed_nodes:
203                    node = self.stashed_nodes.get(id)
204
205                    if index > 0:
206                        text = data[strartIndex:index]
207                        linkText(text)
208
209                    if not isString(node):  # it's Element
210                        for child in [node] + list(node):
211                            if child.tail:
212                                if child.tail.strip():
213                                    self.__processElementText(
214                                        node, child, False
215                                    )
216                            if child.text:
217                                if child.text.strip():
218                                    self.__processElementText(child, child)
219                    else:  # it's just a string
220                        linkText(node)
221                        strartIndex = phEndIndex
222                        continue
223
224                    strartIndex = phEndIndex
225                    result.append((node, self.ancestors[:]))
226
227                else:  # wrong placeholder
228                    end = index + len(self.__placeholder_prefix)
229                    linkText(data[strartIndex:end])
230                    strartIndex = end
231            else:
232                text = data[strartIndex:]
233                if isinstance(data, util.AtomicString):
234                    # We don't want to loose the AtomicString
235                    text = util.AtomicString(text)
236                linkText(text)
237                data = ""
238
239        return result
240
241    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
242        """
243        Check if the line fits the pattern, create the necessary
244        elements, add it to stashed_nodes.
245
246        Keyword arguments:
247
248        * data: the text to be processed
249        * pattern: the pattern to be checked
250        * patternIndex: index of current pattern
251        * startIndex: string index, from which we start searching
252
253        Returns: String with placeholders instead of ElementTree elements.
254
255        """
256        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)
257
258        for exclude in pattern.ANCESTOR_EXCLUDES:
259            if exclude.lower() in self.ancestors:
260                return data, False, 0
261
262        if new_style:
263            match = None
264            # Since handleMatch may reject our first match,
265            # we iterate over the buffer looking for matches
266            # until we can't find any more.
267            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
268                node, start, end = pattern.handleMatch(match, data)
269                if start is None or end is None:
270                    startIndex += match.end(0)
271                    match = None
272                    continue
273                break
274        else:  # pragma: no cover
275            match = pattern.getCompiledRegExp().match(data[startIndex:])
276            leftData = data[:startIndex]
277
278        if not match:
279            return data, False, 0
280
281        if not new_style:  # pragma: no cover
282            node = pattern.handleMatch(match)
283            start = match.start(0)
284            end = match.end(0)
285
286        if node is None:
287            return data, True, end
288
289        if not isString(node):
290            if not isinstance(node.text, util.AtomicString):
291                # We need to process current node too
292                for child in [node] + list(node):
293                    if not isString(node):
294                        if child.text:
295                            self.ancestors.append(child.tag.lower())
296                            child.text = self.__handleInline(
297                                child.text, patternIndex + 1
298                            )
299                            self.ancestors.pop()
300                        if child.tail:
301                            child.tail = self.__handleInline(
302                                child.tail, patternIndex
303                            )
304
305        placeholder = self.__stashNode(node, pattern.type())
306
307        if new_style:
308            return "{}{}{}".format(data[:start],
309                                   placeholder, data[end:]), True, 0
310        else:  # pragma: no cover
311            return "{}{}{}{}".format(leftData,
312                                     match.group(1),
313                                     placeholder, match.groups()[-1]), True, 0
314
315    def __build_ancestors(self, parent, parents):
316        """Build the ancestor list."""
317        ancestors = []
318        while parent is not None:
319            if parent is not None:
320                ancestors.append(parent.tag.lower())
321            parent = self.parent_map.get(parent)
322        ancestors.reverse()
323        parents.extend(ancestors)
324
    def run(self, tree, ancestors=None):
        """Apply inline patterns to a parsed Markdown tree.

        Walk the tree and, for each child element, run the inline patterns
        over its text and tail, inserting the newly created Elements into
        the tree.  If you don't want to process your data with inline
        patterns, instead of normal string, use subclass AtomicString:

            node.text = markdown.AtomicString("This will not be processed.")

        Arguments:

        * tree: ElementTree object, representing Markdown tree.
        * ancestors: List of parent tag names that precede the tree node (if needed).

        Returns: the same `tree` object, with inline patterns applied.

        """
        # Fresh stash (placeholder id -> stashed node) for this run.
        self.stashed_nodes = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        # Map every child element in the tree to its parent element.
        self.parent_map = {c: p for p in tree.iter() for c in p}
        # Work list of (element, ancestor tag names) still to be processed.
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            # `self.ancestors` aliases `parents`; the call below extends it
            # in place with the tag chain leading down to `currElement`.
            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            # (child, new elements) pairs, inserted once the loop is done.
            insertQueue = []
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    # The text sits inside `child`, so its tag is an
                    # ancestor while the text is being processed.
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    # Entries of `lst` are (element, ancestors) tuples, the
                    # same shape as the stack entries.
                    stack += lst
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throwaway element: any plain text that precedes the
                    # first new element is collected in its tail.
                    dumby = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dumby, False)
                    if dumby.tail:
                        child.tail = dumby.tail
                    # New elements become siblings placed right after
                    # `child`; reversed insertion at a fixed index keeps
                    # their original order.
                    pos = list(currElement).index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    # `child` has sub-elements: queue it with a copy of the
                    # current ancestor list.
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            # Insert the elements created from each child's text at the
            # front of that child, in order.
            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree
395
396
class PrettifyTreeprocessor(Treeprocessor):
    """ Insert newlines into the tree so the serialized html is readable. """

    def _prettifyETree(self, elem):
        """
        Recursively add linebreaks to ElementTree children.

        Block-level elements other than 'code' and 'pre' get a newline as
        text when they hold no meaningful text and start with a block-level
        child; their block-level children are handled the same way. Any
        element with an empty or whitespace-only tail gets a newline tail.
        """
        newline = "\n"
        is_block = self.md.is_block_level

        if is_block(elem.tag) and elem.tag not in ['code', 'pre']:
            has_text = elem.text and elem.text.strip()
            if not has_text and len(elem) and is_block(elem[0].tag):
                elem.text = newline
            for sub in elem:
                if is_block(sub.tag):
                    self._prettifyETree(sub)

        has_tail = elem.tail and elem.tail.strip()
        if not has_tail:
            elem.tail = newline

    def run(self, root):
        """
        Add linebreaks to ElementTree root object.

        Keyword arguments:

        * root: root element of the tree; modified in place.

        Returns: None

        """
        self._prettifyETree(root)

        # <br /> elements usually sit inside inline content, which
        # _prettifyETree does not reach, so handle them here.
        for br in root.iter('br'):
            blank_tail = not br.tail or not br.tail.strip()
            br.tail = '\n' if blank_tail else '\n%s' % br.tail

        # Clean up extra empty lines at end of code blocks.
        for pre in root.iter('pre'):
            if not (len(pre) and pre[0].tag == 'code'):
                continue
            code = pre[0]
            # Only prettify code containing text only
            if len(code) or code.text is None:
                continue
            code.text = util.AtomicString(code.text.rstrip() + '\n')
434
435
class UnescapeTreeprocessor(Treeprocessor):
    """ Turn escape markers back into the characters they stand for. """

    # A run of digits wrapped between util.STX and util.ETX.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m):
        """ Return the character whose code point is the matched number. """
        codepoint = int(m.group(1))
        return chr(codepoint)

    def unescape(self, text):
        """ Return `text` with every escape marker replaced by its character. """
        return self.RE.sub(self._unescape, text)

    def run(self, root):
        """
        Unescape the text, tail and attribute values of every element.

        The text of 'code' elements is left as it is; their tail and
        attributes are still unescaped.
        """
        restore = self.unescape
        for elem in root.iter():
            # Text content, except inside code elements
            if elem.text and elem.tag != 'code':
                elem.text = restore(elem.text)
            # Tail content
            if elem.tail:
                elem.tail = restore(elem.tail)
            # Attribute values
            for key, value in elem.items():
                elem.set(key, restore(value))
459