"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).
"""

import re
import xml.etree.ElementTree as etree
from . import util
from . import inlinepatterns


def build_treeprocessors(md, **kwargs):
    """
    Build the default treeprocessors for Markdown.

    Registers the inline, prettify and unescape treeprocessors (with
    priorities 20, 10 and 0 respectively) in a new `util.Registry` and
    returns that registry. `kwargs` is accepted but not used here.
    """
    treeprocessors = util.Registry()
    treeprocessors.register(InlineProcessor(md), 'inline', 20)
    treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10)
    treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0)
    return treeprocessors


def isString(s):
    """
    Check if it's string.

    Returns True only for `str` instances that are NOT `util.AtomicString`
    instances; everything else (including AtomicString) yields False.
    """
    return isinstance(s, str) and not isinstance(s, util.AtomicString)


class Treeprocessor(util.Processor):
    """
    Treeprocessors are run on the ElementTree object before serialization.

    Each Treeprocessor implements a "run" method that takes a pointer to an
    ElementTree, modifies it as necessary and returns an ElementTree
    object.

    Treeprocessors must extend markdown.Treeprocessor.

    """
    def run(self, root):
        """
        Subclasses of Treeprocessor should implement a `run` method, which
        takes a root ElementTree. This method can return another ElementTree
        object, and the existing root ElementTree will be replaced, or it can
        modify the current tree and return None.
        """
        pass  # pragma: no cover


class InlineProcessor(Treeprocessor):
    """
    A Treeprocessor that traverses a tree, applying inline patterns.

    Matched inline content is replaced in the text by placeholder strings
    while the generated nodes are kept in `self.stashed_nodes`; the
    placeholders are later swapped back for the stashed nodes.
    """

    def __init__(self, md):
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
            + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        # Lower-cased tag names of the elements enclosing the text that is
        # currently being processed.
        self.ancestors = []

    def __makePlaceholder(self, type):
        """
        Generate a placeholder.

        The id is the current number of stashed nodes, zero padded to four
        digits. `type` is accepted for interface compatibility but unused.

        Returns: tuple of (placeholder string, id).
        """
        node_id = "%04d" % len(self.stashed_nodes)
        placeholder = util.INLINE_PLACEHOLDER % node_id
        return placeholder, node_id

    def __findPlaceholder(self, data, index):
        """
        Extract id from data string, start from index

        Keyword arguments:

        * data: string
        * index: index, from which we start search

        Returns: placeholder id and string index, after the found placeholder.
        When no placeholder is found, returns `(None, index + 1)`.

        """
        m = self.__placeholder_re.search(data, index)
        if m:
            return m.group(1), m.end()
        else:
            return None, index + 1

    def __stashNode(self, node, type):
        """
        Add node to stash.

        Returns: the placeholder string under whose id the node was stored.
        """
        placeholder, node_id = self.__makePlaceholder(type)
        self.stashed_nodes[node_id] = node
        return placeholder

    def __handleInline(self, data, patternIndex=0):
        """
        Process string with inline patterns and replace it
        with placeholders

        Keyword arguments:

        * data: A line of Markdown text
        * patternIndex: The index of the inlinePattern to start with

        Returns: String with placeholders. `util.AtomicString` input is
        returned untouched.

        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                # Stay on the same pattern for as long as it keeps matching.
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node, subnode, isText=True):
        """
        Process placeholders in Element.text or Element.tail
        of Elements popped from self.stashed_nodes.

        Keywords arguments:

        * node: parent node
        * subnode: processing node
        * isText: bool variable, True - it's text, False - it's tail

        Returns: None

        """
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        # Elements produced from a tail go right after `subnode` inside
        # `node`; elements produced from text go to the front.
        if not isText and node is not subnode:
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting at a fixed position in reverse keeps the original order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild[0])

    def __processPlaceholders(self, data, parent, isText=True):
        """
        Process string with placeholders and generate ElementTree tree.

        Keyword arguments:

        * data: string with placeholders instead of ElementTree elements.
        * parent: Element, which contains processing inline data
        * isText: True when `data` came from `parent.text`, False when it
          came from `parent.tail`; decides where leading text is attached.

        Returns: list of `(element, ancestors)` tuples for the ElementTree
        elements with applied inline patterns.

        """
        result = []

        def linkText(text):
            # Attach plain text to the tail of the last emitted element or,
            # when nothing has been emitted yet, to the parent's tail/text.
            if text:
                if result:
                    if result[-1][0].tail:
                        result[-1][0].tail += text
                    else:
                        result[-1][0].tail = text
                elif not isText:
                    if parent.tail:
                        parent.tail += text
                    else:
                        parent.tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text

        startIndex = 0
        while data:
            index = data.find(self.__placeholder_prefix, startIndex)
            if index != -1:
                node_id, phEndIndex = self.__findPlaceholder(data, index)

                if node_id in self.stashed_nodes:
                    node = self.stashed_nodes.get(node_id)

                    if index > 0:
                        text = data[startIndex:index]
                        linkText(text)

                    if not isString(node):  # it's Element
                        # Resolve placeholders nested in the stashed
                        # element and its direct children.
                        for child in [node] + list(node):
                            if child.tail:
                                if child.tail.strip():
                                    self.__processElementText(
                                        node, child, False
                                    )
                            if child.text:
                                if child.text.strip():
                                    self.__processElementText(child, child)
                    else:  # it's just a string
                        linkText(node)
                        startIndex = phEndIndex
                        continue

                    startIndex = phEndIndex
                    result.append((node, self.ancestors[:]))

                else:  # wrong placeholder
                    # Emit the prefix as literal text and keep scanning
                    # right after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[startIndex:end])
                    startIndex = end
            else:
                text = data[startIndex:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the AtomicString
                    text = util.AtomicString(text)
                linkText(text)
                data = ""

        return result

    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to stashed_nodes.

        Keyword arguments:

        * data: the text to be processed
        * pattern: the pattern to be checked
        * patternIndex: index of current pattern
        * startIndex: string index, from which we start searching

        Returns: tuple of `(data, matched, index)` where `data` is the
        string with placeholders instead of ElementTree elements, `matched`
        tells whether the pattern matched, and `index` is the string index
        to continue searching from.

        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        for exclude in pattern.ANCESTOR_EXCLUDES:
            if exclude.lower() in self.ancestors:
                return data, False, 0

        if new_style:
            match = None
            # Since handleMatch may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(match, data)
                if start is None or end is None:
                    # Rejected by the pattern; try the next match.
                    match = None
                    continue
                break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if not match:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            return data, True, end

        if not isString(node):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + list(node):
                    if child.text:
                        self.ancestors.append(child.tag.lower())
                        child.text = self.__handleInline(
                            child.text, patternIndex + 1
                        )
                        self.ancestors.pop()
                    if child.tail:
                        child.tail = self.__handleInline(
                            child.tail, patternIndex
                        )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent, parents):
        """
        Build the ancestor list.

        Walks `self.parent_map` upwards starting at `parent` and extends
        `parents` in place with the lower-cased tag names, outermost first.
        """
        ancestors = []
        while parent is not None:
            ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        ancestors.reverse()
        parents.extend(ancestors)

    def run(self, tree, ancestors=None):
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over ElementTree, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree.  If you don't
        want to process your data with inline patterns, instead of normal
        string, use subclass AtomicString:

            node.text = markdown.AtomicString("This will not be processed.")

        Arguments:

        * tree: ElementTree object, representing Markdown tree.
        * ancestors: List of parent tag names that precede the tree node (if needed).

        Returns: ElementTree object with applied inline patterns.

        """
        self.stashed_nodes = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            insertQueue = []
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    # Newly created elements still need their own pass.
                    stack += lst
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throwaway element that only collects the leading text
                    # of the processed tail.
                    dummy = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dummy, False)
                    if dummy.tail:
                        child.tail = dummy.tail
                    pos = list(currElement).index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            # Insert the elements generated from each child's text once the
            # iteration over `currElement` is finished.
            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree


class PrettifyTreeprocessor(Treeprocessor):
    """ Add linebreaks to the html document. """

    def _prettifyETree(self, elem):
        """ Recursively add linebreaks to ElementTree children. """

        i = "\n"
        if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']:
            if (not elem.text or not elem.text.strip()) \
                    and len(elem) and self.md.is_block_level(elem[0].tag):
                elem.text = i
            for e in elem:
                if self.md.is_block_level(e.tag):
                    self._prettifyETree(e)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i

    def run(self, root):
        """
        Add linebreaks to ElementTree root object.

        Modifies `root` in place and returns None.
        """

        self._prettifyETree(root)
        # Do <br />'s separately as they are often in the middle of
        # inline content and missed by _prettifyETree.
        brs = root.iter('br')
        for br in brs:
            if not br.tail or not br.tail.strip():
                br.tail = '\n'
            else:
                br.tail = '\n%s' % br.tail
        # Clean up extra empty lines at end of code blocks.
        pres = root.iter('pre')
        for pre in pres:
            if len(pre) and pre[0].tag == 'code':
                code = pre[0]
                # Only prettify code containing text only
                if not len(code) and code.text is not None:
                    code.text = util.AtomicString(code.text.rstrip() + '\n')


class UnescapeTreeprocessor(Treeprocessor):
    """ Restore escaped chars """

    # A run of digits wrapped between util.STX and util.ETX.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m):
        """ Return the character whose code point is the matched number. """
        return chr(int(m.group(1)))

    def unescape(self, text):
        """ Return `text` with every match of `RE` replaced by its character. """
        return self.RE.sub(self._unescape, text)

    def run(self, root):
        """
        Loop over all elements and unescape all text.

        Modifies `root` in place and returns None. The text of `code`
        elements is left alone; their tails and attributes are processed.
        """
        for elem in root.iter():
            # Unescape text content
            if elem.text and elem.tag != 'code':
                elem.text = self.unescape(elem.text)
            # Unescape tail content
            if elem.tail:
                elem.tail = self.unescape(elem.tail)
            # Unescape attribute values
            for key, value in elem.items():
                elem.set(key, self.unescape(value))