1""" 2Python Markdown 3 4A Python implementation of John Gruber's Markdown. 5 6Documentation: https://python-markdown.github.io/ 7GitHub: https://github.com/Python-Markdown/markdown/ 8PyPI: https://pypi.org/project/Markdown/ 9 10Started by Manfred Stienstra (http://www.dwerg.net/). 11Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 12Currently maintained by Waylan Limberg (https://github.com/waylan), 13Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 14 15Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later) 16Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 17Copyright 2004 Manfred Stienstra (the original version) 18 19License: BSD (see LICENSE.md for details). 20 21CORE MARKDOWN BLOCKPARSER 22=========================================================================== 23 24This parser handles basic parsing of Markdown blocks. It doesn't concern 25itself with inline elements such as **bold** or *italics*, but rather just 26catches blocks, lists, quotes, etc. 27 28The BlockParser is made up of a bunch of BlockProcessors, each handling a 29different type of block. Extensions may add/replace/remove BlockProcessors 30as they need to alter how markdown blocks are parsed. 31""" 32 33import logging 34import re 35import xml.etree.ElementTree as etree 36from . import util 37from .blockparser import BlockParser 38 39logger = logging.getLogger('MARKDOWN') 40 41 42def build_block_parser(md, **kwargs): 43 """ Build the default block parser used by Markdown. """ 44 parser = BlockParser(md) 45 parser.blockprocessors.register(EmptyBlockProcessor(parser), 'empty', 100) 46 parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 90) 47 parser.blockprocessors.register(CodeBlockProcessor(parser), 'code', 80) 48 parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 70) 49 parser.blockprocessors.register(SetextHeaderProcessor(parser), 'setextheader', 60) 50 parser.blockprocessors.register(HRProcessor(parser), 'hr', 50) 51 parser.blockprocessors.register(OListProcessor(parser), 'olist', 40) 52 parser.blockprocessors.register(UListProcessor(parser), 'ulist', 30) 53 parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 20) 54 parser.blockprocessors.register(ReferenceProcessor(parser), 'reference', 15) 55 parser.blockprocessors.register(ParagraphProcessor(parser), 'paragraph', 10) 56 return parser 57 58 59class BlockProcessor: 60 """ Base class for block processors. 61 62 Each subclass will provide the methods below to work with the source and 63 tree. Each processor will need to define it's own ``test`` and ``run`` 64 methods. The ``test`` method should return True or False, to indicate 65 whether the current block should be processed by this processor. If the 66 test passes, the parser will call the processors ``run`` method. 67 68 """ 69 70 def __init__(self, parser): 71 self.parser = parser 72 self.tab_length = parser.md.tab_length 73 74 def lastChild(self, parent): 75 """ Return the last child of an etree element. """ 76 if len(parent): 77 return parent[-1] 78 else: 79 return None 80 81 def detab(self, text, length=None): 82 """ Remove a tab from the front of each line of the given text. """ 83 if length is None: 84 length = self.tab_length 85 newtext = [] 86 lines = text.split('\n') 87 for line in lines: 88 if line.startswith(' ' * length): 89 newtext.append(line[length:]) 90 elif not line.strip(): 91 newtext.append('') 92 else: 93 break 94 return '\n'.join(newtext), '\n'.join(lines[len(newtext):]) 95 96 def looseDetab(self, text, level=1): 97 """ Remove a tab from front of lines but allowing dedented lines. """ 98 lines = text.split('\n') 99 for i in range(len(lines)): 100 if lines[i].startswith(' '*self.tab_length*level): 101 lines[i] = lines[i][self.tab_length*level:] 102 return '\n'.join(lines) 103 104 def test(self, parent, block): 105 """ Test for block type. Must be overridden by subclasses. 106 107 As the parser loops through processors, it will call the ``test`` 108 method on each to determine if the given block of text is of that 109 type. This method must return a boolean ``True`` or ``False``. The 110 actual method of testing is left to the needs of that particular 111 block type. It could be as simple as ``block.startswith(some_string)`` 112 or a complex regular expression. As the block type may be different 113 depending on the parent of the block (i.e. inside a list), the parent 114 etree element is also provided and may be used as part of the test. 115 116 Keywords: 117 118 * ``parent``: A etree element which will be the parent of the block. 119 * ``block``: A block of text from the source which has been split at 120 blank lines. 121 """ 122 pass # pragma: no cover 123 124 def run(self, parent, blocks): 125 """ Run processor. Must be overridden by subclasses. 126 127 When the parser determines the appropriate type of a block, the parser 128 will call the corresponding processor's ``run`` method. This method 129 should parse the individual lines of the block and append them to 130 the etree. 131 132 Note that both the ``parent`` and ``etree`` keywords are pointers 133 to instances of the objects which should be edited in place. Each 134 processor must make changes to the existing objects as there is no 135 mechanism to return new/different objects to replace them. 136 137 This means that this method should be adding SubElements or adding text 138 to the parent, and should remove (``pop``) or add (``insert``) items to 139 the list of blocks. 140 141 Keywords: 142 143 * ``parent``: A etree element which is the parent of the current block. 144 * ``blocks``: A list of all remaining blocks of the document. 145 """ 146 pass # pragma: no cover 147 148 149class ListIndentProcessor(BlockProcessor): 150 """ Process children of list items. 151 152 Example: 153 * a list item 154 process this part 155 156 or this part 157 158 """ 159 160 ITEM_TYPES = ['li'] 161 LIST_TYPES = ['ul', 'ol'] 162 163 def __init__(self, *args): 164 super().__init__(*args) 165 self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length) 166 167 def test(self, parent, block): 168 return block.startswith(' '*self.tab_length) and \ 169 not self.parser.state.isstate('detabbed') and \ 170 (parent.tag in self.ITEM_TYPES or 171 (len(parent) and parent[-1] is not None and 172 (parent[-1].tag in self.LIST_TYPES))) 173 174 def run(self, parent, blocks): 175 block = blocks.pop(0) 176 level, sibling = self.get_level(parent, block) 177 block = self.looseDetab(block, level) 178 179 self.parser.state.set('detabbed') 180 if parent.tag in self.ITEM_TYPES: 181 # It's possible that this parent has a 'ul' or 'ol' child list 182 # with a member. If that is the case, then that should be the 183 # parent. This is intended to catch the edge case of an indented 184 # list whose first member was parsed previous to this point 185 # see OListProcessor 186 if len(parent) and parent[-1].tag in self.LIST_TYPES: 187 self.parser.parseBlocks(parent[-1], [block]) 188 else: 189 # The parent is already a li. Just parse the child block. 190 self.parser.parseBlocks(parent, [block]) 191 elif sibling.tag in self.ITEM_TYPES: 192 # The sibling is a li. Use it as parent. 193 self.parser.parseBlocks(sibling, [block]) 194 elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES: 195 # The parent is a list (``ol`` or ``ul``) which has children. 196 # Assume the last child li is the parent of this block. 197 if sibling[-1].text: 198 # If the parent li has text, that text needs to be moved to a p 199 # The p must be 'inserted' at beginning of list in the event 200 # that other children already exist i.e.; a nested sublist. 201 p = etree.Element('p') 202 p.text = sibling[-1].text 203 sibling[-1].text = '' 204 sibling[-1].insert(0, p) 205 self.parser.parseChunk(sibling[-1], block) 206 else: 207 self.create_item(sibling, block) 208 self.parser.state.reset() 209 210 def create_item(self, parent, block): 211 """ Create a new li and parse the block with it as the parent. """ 212 li = etree.SubElement(parent, 'li') 213 self.parser.parseBlocks(li, [block]) 214 215 def get_level(self, parent, block): 216 """ Get level of indent based on list level. """ 217 # Get indent level 218 m = self.INDENT_RE.match(block) 219 if m: 220 indent_level = len(m.group(1))/self.tab_length 221 else: 222 indent_level = 0 223 if self.parser.state.isstate('list'): 224 # We're in a tightlist - so we already are at correct parent. 225 level = 1 226 else: 227 # We're in a looselist - so we need to find parent. 228 level = 0 229 # Step through children of tree to find matching indent level. 230 while indent_level > level: 231 child = self.lastChild(parent) 232 if (child is not None and 233 (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)): 234 if child.tag in self.LIST_TYPES: 235 level += 1 236 parent = child 237 else: 238 # No more child levels. If we're short of indent_level, 239 # we have a code block. So we stop here. 240 break 241 return level, parent 242 243 244class CodeBlockProcessor(BlockProcessor): 245 """ Process code blocks. """ 246 247 def test(self, parent, block): 248 return block.startswith(' '*self.tab_length) 249 250 def run(self, parent, blocks): 251 sibling = self.lastChild(parent) 252 block = blocks.pop(0) 253 theRest = '' 254 if (sibling is not None and sibling.tag == "pre" and 255 len(sibling) and sibling[0].tag == "code"): 256 # The previous block was a code block. As blank lines do not start 257 # new code blocks, append this block to the previous, adding back 258 # linebreaks removed from the split into a list. 259 code = sibling[0] 260 block, theRest = self.detab(block) 261 code.text = util.AtomicString( 262 '{}\n{}\n'.format(code.text, util.code_escape(block.rstrip())) 263 ) 264 else: 265 # This is a new codeblock. Create the elements and insert text. 266 pre = etree.SubElement(parent, 'pre') 267 code = etree.SubElement(pre, 'code') 268 block, theRest = self.detab(block) 269 code.text = util.AtomicString('%s\n' % util.code_escape(block.rstrip())) 270 if theRest: 271 # This block contained unindented line(s) after the first indented 272 # line. Insert these lines as the first block of the master blocks 273 # list for future processing. 274 blocks.insert(0, theRest) 275 276 277class BlockQuoteProcessor(BlockProcessor): 278 279 RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)') 280 281 def test(self, parent, block): 282 return bool(self.RE.search(block)) and not util.nearing_recursion_limit() 283 284 def run(self, parent, blocks): 285 block = blocks.pop(0) 286 m = self.RE.search(block) 287 if m: 288 before = block[:m.start()] # Lines before blockquote 289 # Pass lines before blockquote in recursively for parsing first. 290 self.parser.parseBlocks(parent, [before]) 291 # Remove ``> `` from beginning of each line. 292 block = '\n'.join( 293 [self.clean(line) for line in block[m.start():].split('\n')] 294 ) 295 sibling = self.lastChild(parent) 296 if sibling is not None and sibling.tag == "blockquote": 297 # Previous block was a blockquote so set that as this blocks parent 298 quote = sibling 299 else: 300 # This is a new blockquote. Create a new parent element. 301 quote = etree.SubElement(parent, 'blockquote') 302 # Recursively parse block with blockquote as parent. 303 # change parser state so blockquotes embedded in lists use p tags 304 self.parser.state.set('blockquote') 305 self.parser.parseChunk(quote, block) 306 self.parser.state.reset() 307 308 def clean(self, line): 309 """ Remove ``>`` from beginning of a line. """ 310 m = self.RE.match(line) 311 if line.strip() == ">": 312 return "" 313 elif m: 314 return m.group(2) 315 else: 316 return line 317 318 319class OListProcessor(BlockProcessor): 320 """ Process ordered list blocks. """ 321 322 TAG = 'ol' 323 # The integer (python string) with which the lists starts (default=1) 324 # Eg: If list is initialized as) 325 # 3. Item 326 # The ol tag will get starts="3" attribute 327 STARTSWITH = '1' 328 # Lazy ol - ignore startswith 329 LAZY_OL = True 330 # List of allowed sibling tags. 331 SIBLING_TAGS = ['ol', 'ul'] 332 333 def __init__(self, parser): 334 super().__init__(parser) 335 # Detect an item (``1. item``). ``group(1)`` contains contents of item. 336 self.RE = re.compile(r'^[ ]{0,%d}\d+\.[ ]+(.*)' % (self.tab_length - 1)) 337 # Detect items on secondary lines. they can be of either list type. 338 self.CHILD_RE = re.compile(r'^[ ]{0,%d}((\d+\.)|[*+-])[ ]+(.*)' % 339 (self.tab_length - 1)) 340 # Detect indented (nested) items of either type 341 self.INDENT_RE = re.compile(r'^[ ]{%d,%d}((\d+\.)|[*+-])[ ]+.*' % 342 (self.tab_length, self.tab_length * 2 - 1)) 343 344 def test(self, parent, block): 345 return bool(self.RE.match(block)) 346 347 def run(self, parent, blocks): 348 # Check fr multiple items in one block. 349 items = self.get_items(blocks.pop(0)) 350 sibling = self.lastChild(parent) 351 352 if sibling is not None and sibling.tag in self.SIBLING_TAGS: 353 # Previous block was a list item, so set that as parent 354 lst = sibling 355 # make sure previous item is in a p- if the item has text, 356 # then it isn't in a p 357 if lst[-1].text: 358 # since it's possible there are other children for this 359 # sibling, we can't just SubElement the p, we need to 360 # insert it as the first item. 361 p = etree.Element('p') 362 p.text = lst[-1].text 363 lst[-1].text = '' 364 lst[-1].insert(0, p) 365 # if the last item has a tail, then the tail needs to be put in a p 366 # likely only when a header is not followed by a blank line 367 lch = self.lastChild(lst[-1]) 368 if lch is not None and lch.tail: 369 p = etree.SubElement(lst[-1], 'p') 370 p.text = lch.tail.lstrip() 371 lch.tail = '' 372 373 # parse first block differently as it gets wrapped in a p. 374 li = etree.SubElement(lst, 'li') 375 self.parser.state.set('looselist') 376 firstitem = items.pop(0) 377 self.parser.parseBlocks(li, [firstitem]) 378 self.parser.state.reset() 379 elif parent.tag in ['ol', 'ul']: 380 # this catches the edge case of a multi-item indented list whose 381 # first item is in a blank parent-list item: 382 # * * subitem1 383 # * subitem2 384 # see also ListIndentProcessor 385 lst = parent 386 else: 387 # This is a new list so create parent with appropriate tag. 388 lst = etree.SubElement(parent, self.TAG) 389 # Check if a custom start integer is set 390 if not self.LAZY_OL and self.STARTSWITH != '1': 391 lst.attrib['start'] = self.STARTSWITH 392 393 self.parser.state.set('list') 394 # Loop through items in block, recursively parsing each with the 395 # appropriate parent. 396 for item in items: 397 if item.startswith(' '*self.tab_length): 398 # Item is indented. Parse with last item as parent 399 self.parser.parseBlocks(lst[-1], [item]) 400 else: 401 # New item. Create li and parse with it as parent 402 li = etree.SubElement(lst, 'li') 403 self.parser.parseBlocks(li, [item]) 404 self.parser.state.reset() 405 406 def get_items(self, block): 407 """ Break a block into list items. """ 408 items = [] 409 for line in block.split('\n'): 410 m = self.CHILD_RE.match(line) 411 if m: 412 # This is a new list item 413 # Check first item for the start index 414 if not items and self.TAG == 'ol': 415 # Detect the integer value of first list item 416 INTEGER_RE = re.compile(r'(\d+)') 417 self.STARTSWITH = INTEGER_RE.match(m.group(1)).group() 418 # Append to the list 419 items.append(m.group(3)) 420 elif self.INDENT_RE.match(line): 421 # This is an indented (possibly nested) item. 422 if items[-1].startswith(' '*self.tab_length): 423 # Previous item was indented. Append to that item. 424 items[-1] = '{}\n{}'.format(items[-1], line) 425 else: 426 items.append(line) 427 else: 428 # This is another line of previous item. Append to that item. 429 items[-1] = '{}\n{}'.format(items[-1], line) 430 return items 431 432 433class UListProcessor(OListProcessor): 434 """ Process unordered list blocks. """ 435 436 TAG = 'ul' 437 438 def __init__(self, parser): 439 super().__init__(parser) 440 # Detect an item (``1. item``). ``group(1)`` contains contents of item. 441 self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % (self.tab_length - 1)) 442 443 444class HashHeaderProcessor(BlockProcessor): 445 """ Process Hash Headers. """ 446 447 # Detect a header at start of any line in block 448 RE = re.compile(r'(?:^|\n)(?P<level>#{1,6})(?P<header>(?:\\.|[^\\])*?)#*(?:\n|$)') 449 450 def test(self, parent, block): 451 return bool(self.RE.search(block)) 452 453 def run(self, parent, blocks): 454 block = blocks.pop(0) 455 m = self.RE.search(block) 456 if m: 457 before = block[:m.start()] # All lines before header 458 after = block[m.end():] # All lines after header 459 if before: 460 # As the header was not the first line of the block and the 461 # lines before the header must be parsed first, 462 # recursively parse this lines as a block. 463 self.parser.parseBlocks(parent, [before]) 464 # Create header using named groups from RE 465 h = etree.SubElement(parent, 'h%d' % len(m.group('level'))) 466 h.text = m.group('header').strip() 467 if after: 468 # Insert remaining lines as first block for future parsing. 469 blocks.insert(0, after) 470 else: # pragma: no cover 471 # This should never happen, but just in case... 472 logger.warn("We've got a problem header: %r" % block) 473 474 475class SetextHeaderProcessor(BlockProcessor): 476 """ Process Setext-style Headers. """ 477 478 # Detect Setext-style header. Must be first 2 lines of block. 479 RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE) 480 481 def test(self, parent, block): 482 return bool(self.RE.match(block)) 483 484 def run(self, parent, blocks): 485 lines = blocks.pop(0).split('\n') 486 # Determine level. ``=`` is 1 and ``-`` is 2. 487 if lines[1].startswith('='): 488 level = 1 489 else: 490 level = 2 491 h = etree.SubElement(parent, 'h%d' % level) 492 h.text = lines[0].strip() 493 if len(lines) > 2: 494 # Block contains additional lines. Add to master blocks for later. 495 blocks.insert(0, '\n'.join(lines[2:])) 496 497 498class HRProcessor(BlockProcessor): 499 """ Process Horizontal Rules. """ 500 501 # Python's re module doesn't officially support atomic grouping. However you can fake it. 502 # See https://stackoverflow.com/a/13577411/866026 503 RE = r'^[ ]{0,3}(?=(?P<atomicgroup>(-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,}))(?P=atomicgroup)[ ]*$' 504 # Detect hr on any line of a block. 505 SEARCH_RE = re.compile(RE, re.MULTILINE) 506 507 def test(self, parent, block): 508 m = self.SEARCH_RE.search(block) 509 if m: 510 # Save match object on class instance so we can use it later. 511 self.match = m 512 return True 513 return False 514 515 def run(self, parent, blocks): 516 block = blocks.pop(0) 517 match = self.match 518 # Check for lines in block before hr. 519 prelines = block[:match.start()].rstrip('\n') 520 if prelines: 521 # Recursively parse lines before hr so they get parsed first. 522 self.parser.parseBlocks(parent, [prelines]) 523 # create hr 524 etree.SubElement(parent, 'hr') 525 # check for lines in block after hr. 526 postlines = block[match.end():].lstrip('\n') 527 if postlines: 528 # Add lines after hr to master blocks for later parsing. 529 blocks.insert(0, postlines) 530 531 532class EmptyBlockProcessor(BlockProcessor): 533 """ Process blocks that are empty or start with an empty line. """ 534 535 def test(self, parent, block): 536 return not block or block.startswith('\n') 537 538 def run(self, parent, blocks): 539 block = blocks.pop(0) 540 filler = '\n\n' 541 if block: 542 # Starts with empty line 543 # Only replace a single line. 544 filler = '\n' 545 # Save the rest for later. 546 theRest = block[1:] 547 if theRest: 548 # Add remaining lines to master blocks for later. 549 blocks.insert(0, theRest) 550 sibling = self.lastChild(parent) 551 if (sibling is not None and sibling.tag == 'pre' and 552 len(sibling) and sibling[0].tag == 'code'): 553 # Last block is a codeblock. Append to preserve whitespace. 554 sibling[0].text = util.AtomicString( 555 '{}{}'.format(sibling[0].text, filler) 556 ) 557 558 559class ReferenceProcessor(BlockProcessor): 560 """ Process link references. """ 561 RE = re.compile( 562 r'^[ ]{0,3}\[([^\[\]]*)\]:[ ]*\n?[ ]*([^\s]+)[ ]*(?:\n[ ]*)?((["\'])(.*)\4[ ]*|\((.*)\)[ ]*)?$', re.MULTILINE 563 ) 564 565 def test(self, parent, block): 566 return True 567 568 def run(self, parent, blocks): 569 block = blocks.pop(0) 570 m = self.RE.search(block) 571 if m: 572 id = m.group(1).strip().lower() 573 link = m.group(2).lstrip('<').rstrip('>') 574 title = m.group(5) or m.group(6) 575 self.parser.md.references[id] = (link, title) 576 if block[m.end():].strip(): 577 # Add any content after match back to blocks as separate block 578 blocks.insert(0, block[m.end():].lstrip('\n')) 579 if block[:m.start()].strip(): 580 # Add any content before match back to blocks as separate block 581 blocks.insert(0, block[:m.start()].rstrip('\n')) 582 return True 583 # No match. Restore block. 584 blocks.insert(0, block) 585 return False 586 587 588class ParagraphProcessor(BlockProcessor): 589 """ Process Paragraph blocks. """ 590 591 def test(self, parent, block): 592 return True 593 594 def run(self, parent, blocks): 595 block = blocks.pop(0) 596 if block.strip(): 597 # Not a blank block. Add to parent, otherwise throw it away. 598 if self.parser.state.isstate('list'): 599 # The parent is a tight-list. 600 # 601 # Check for any children. This will likely only happen in a 602 # tight-list when a header isn't followed by a blank line. 603 # For example: 604 # 605 # * # Header 606 # Line 2 of list item - not part of header. 607 sibling = self.lastChild(parent) 608 if sibling is not None: 609 # Insetrt after sibling. 610 if sibling.tail: 611 sibling.tail = '{}\n{}'.format(sibling.tail, block) 612 else: 613 sibling.tail = '\n%s' % block 614 else: 615 # Append to parent.text 616 if parent.text: 617 parent.text = '{}\n{}'.format(parent.text, block) 618 else: 619 parent.text = block.lstrip() 620 else: 621 # Create a regular paragraph 622 p = etree.SubElement(parent, 'p') 623 p.text = block.lstrip() 624