"""
CORE MARKDOWN BLOCKPARSER
=============================================================================

This parser handles basic parsing of Markdown blocks.  It doesn't concern
itself with inline elements such as **bold** or *italics*, but rather just
catches blocks, lists, quotes, etc.

The BlockParser is made up of a bunch of BlockProcessors, each handling a
different type of block. Extensions may add/replace/remove BlockProcessors
as they need to alter how markdown blocks are parsed.

"""

import re
import markdown


class BlockProcessor:
    """ Base class for block processors.

    Each subclass will provide the methods below to work with the source and
    tree. Each processor will need to define its own ``test`` and ``run``
    methods. The ``test`` method should return True or False, to indicate
    whether the current block should be processed by this processor. If the
    test passes, the parser will call the processor's ``run`` method.

    """

    def __init__(self, parser=None):
        # The owning parser; processors use it to recurse (``parseBlocks`` /
        # ``parseChunk``) and to query/alter the parser ``state``.
        self.parser = parser

    def lastChild(self, parent):
        """ Return the last child of an etree element, or None if empty. """
        if len(parent):
            return parent[-1]
        return None

    def detab(self, text):
        """ Remove a tab from the front of each line of the given text.

        Lines are consumed from the top until the first non-blank line that
        is not indented by a full tab.  Blank lines are kept as ``''``.

        Returns a 2-tuple ``(detabbed, rest)`` where ``detabbed`` is the
        consumed lines with one tab removed and ``rest`` is the untouched
        remainder (both joined with newlines).
        """
        tab = ' ' * markdown.TAB_LENGTH
        lines = text.split('\n')
        newtext = []
        for line in lines:
            if line.startswith(tab):
                newtext.append(line[markdown.TAB_LENGTH:])
            elif not line.strip():
                newtext.append('')
            else:
                # First dedented, non-blank line ends the indented run.
                break
        return '\n'.join(newtext), '\n'.join(lines[len(newtext):])

    def looseDetab(self, text, level=1):
        """ Remove a tab from front of lines but allowing dedented lines.

        ``level`` is the number of tabs to strip.  Lines that are not
        indented by at least ``level`` tabs are passed through unchanged.
        """
        width = markdown.TAB_LENGTH * level
        tab = ' ' * width
        lines = text.split('\n')
        for i, line in enumerate(lines):
            if line.startswith(tab):
                lines[i] = line[width:]
        return '\n'.join(lines)

    def test(self, parent, block):
        """ Test for block type. Must be overridden by subclasses.

        As the parser loops through processors, it will call the ``test``
        method on each to determine if the given block of text is of that
        type. This method must return a boolean ``True`` or ``False``. The
        actual method of testing is left to the needs of that particular
        block type. It could be as simple as ``block.startswith(some_string)``
        or a complex regular expression. As the block type may be different
        depending on the parent of the block (i.e. inside a list), the parent
        etree element is also provided and may be used as part of the test.

        Keywords:

        * ``parent``: A etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
            blank lines.
        """
        pass

    def run(self, parent, blocks):
        """ Run processor. Must be overridden by subclasses.

        When the parser determines the appropriate type of a block, the
        parser will call the corresponding processor's ``run`` method. This
        method should parse the individual lines of the block and append them
        to the etree.

        Note that both the ``parent`` and ``blocks`` keywords are pointers
        to instances of the objects which should be edited in place. Each
        processor must make changes to the existing objects as there is no
        mechanism to return new/different objects to replace them.

        This means that this method should be adding SubElements or adding
        text to the parent, and should remove (``pop``) or add (``insert``)
        items to the list of blocks.

        Keywords:

        * ``parent``: A etree element which is the parent of the current
            block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
        pass


class ListIndentProcessor(BlockProcessor):
    """ Process children of list items.

    Example:
        * a list item
            process this part

            or this part

    """

    # One or more whole tabs of leading indent; group(1) is the full indent.
    INDENT_RE = re.compile(r'^(([ ]{%s})+)' % markdown.TAB_LENGTH)
    ITEM_TYPES = ['li']
    LIST_TYPES = ['ul', 'ol']

    def test(self, parent, block):
        """ Return True for an indented block that belongs to a list.

        The block must start with a full tab, the parser must not already be
        in the ``detabbed`` state, and either ``parent`` is a list item or
        the last child of ``parent`` is a non-empty list.
        """
        if not block.startswith(' ' * markdown.TAB_LENGTH):
            return False
        if self.parser.state.isstate('detabbed'):
            return False
        if parent.tag in self.ITEM_TYPES:
            return True
        # The original tested the truth value of ``parent[-1]``; an etree
        # element is only true when it has children, so spell that out with
        # ``len`` instead of relying on the deprecated implicit test.
        return bool(len(parent) and len(parent[-1]) and
                    parent[-1].tag in self.LIST_TYPES)

    def run(self, parent, blocks):
        """ Detab the block and parse it under the list item it belongs to.

        The parser is put in the ``detabbed`` state while the child block is
        parsed and the state is reset afterwards.
        """
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        self.parser.state.set('detabbed')
        if parent.tag in self.ITEM_TYPES:
            # The parent is already a li. Just parse the child block.
            self.parser.parseBlocks(parent, [block])
        elif sibling.tag in self.ITEM_TYPES:
            # The sibling is a li. Use it as parent.
            self.parser.parseBlocks(sibling, [block])
        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
            # The parent is a list (``ol`` or ``ul``) which has children.
            # Assume the last child li is the parent of this block.
            if sibling[-1].text:
                # If the parent li has text, that text needs to be moved to
                # a p; prepend it as its own block so it is parsed first.
                block = '%s\n\n%s' % (sibling[-1].text, block)
                sibling[-1].text = ''
            self.parser.parseChunk(sibling[-1], block)
        else:
            self.create_item(sibling, block)
        self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        li = markdown.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(li, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns a 2-tuple ``(level, parent)``: the list nesting level reached
        and the element found at that level by walking down the last
        children of ``parent``.
        """
        # Get indent level.  Floor division keeps this an int on Python 3
        # (``/`` would produce a float there); the value is only compared
        # against ``level`` so results are unchanged.
        m = self.INDENT_RE.match(block)
        if m:
            indent_level = len(m.group(1)) // markdown.TAB_LENGTH
        else:
            indent_level = 0
        if self.parser.state.isstate('list'):
            # We're in a tightlist - so we already are at correct parent.
            level = 1
        else:
            # We're in a looselist - so we need to find parent.
            level = 0
        # Step through children of tree to find matching indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            # ``len(child)`` reproduces the original implicit truth test on
            # the element: a childless list/li is not descended into.
            if child is not None and len(child) and \
                    (child.tag in self.LIST_TYPES or
                     child.tag in self.ITEM_TYPES):
                if child.tag in self.LIST_TYPES:
                    # Only list containers count as a nesting level.
                    level += 1
                parent = child
            else:
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
        return level, parent


class CodeBlockProcessor(BlockProcessor):
    """ Process code blocks. """

    def test(self, parent, block):
        """ Return True if the block starts with a full tab of indent. """
        return block.startswith(' ' * markdown.TAB_LENGTH)

    def run(self, parent, blocks):
        """ Add the block to a new or the preceding ``pre``/``code`` pair.

        Any unindented lines trailing the indented ones are pushed back to
        the front of ``blocks`` for later processing.
        """
        sibling = self.lastChild(parent)
        block, theRest = self.detab(blocks.pop(0))
        if sibling is not None and sibling.tag == "pre" and len(sibling) \
                and sibling[0].tag == "code":
            # The previous block was a code block. As blank lines do not
            # start new code blocks, append this block to the previous,
            # adding back linebreaks removed from the split into a list.
            code = sibling[0]
            code.text = markdown.AtomicString(
                '%s\n%s\n' % (code.text, block.rstrip()))
        else:
            # This is a new codeblock. Create the elements and insert text.
            pre = markdown.etree.SubElement(parent, 'pre')
            code = markdown.etree.SubElement(pre, 'code')
            code.text = markdown.AtomicString('%s\n' % block.rstrip())
        if theRest:
            # This block contained unindented line(s) after the first
            # indented line. Insert these lines as the first block of the
            # master blocks list for future processing.
            blocks.insert(0, theRest)


class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquotes (lines introduced by ``>``). """

    # A ``>`` marker at the start of any line; group(2) is the quoted text.
    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        """ Return True if any line of the block starts a blockquote. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Parse the quoted part of the block inside a ``blockquote``. """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()]  # Lines before blockquote
            # Pass lines before blockquote in recursively for parsing first.
            self.parser.parseBlocks(parent, [before])
            # Remove ``> `` from beginning of each line.
            block = '\n'.join([self.clean(line) for line in
                               block[m.start():].split('\n')])
        sibling = self.lastChild(parent)
        # ``len(sibling)`` reproduces the original implicit truth test on
        # the element (a childless element tested as False).
        if sibling is not None and len(sibling) and \
                sibling.tag == "blockquote":
            # Previous block was a blockquote so set that as this block's
            # parent.
            quote = sibling
        else:
            # This is a new blockquote. Create a new parent element.
            quote = markdown.etree.SubElement(parent, 'blockquote')
        # Recursively parse block with blockquote as parent.
        self.parser.parseChunk(quote, block)

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        m = self.RE.match(line)
        if line.strip() == ">":
            return ""
        elif m:
            return m.group(2)
        else:
            return line


class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
    # Detect items on secondary lines. They can be of either list type.
    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
    # Detect indented (nested) items of either type
    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')

    def test(self, parent, block):
        """ Return True if the block starts with a list item of this type. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Build list items from the block under a new or preceding list. """
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)
        # ``len(sibling)`` reproduces the original implicit truth test on
        # the element (a childless element tested as False).
        if sibling is not None and len(sibling) and \
                sibling.tag in ['ol', 'ul']:
            # Previous block was a list item, so set that as parent
            lst = sibling
            # make sure previous item is in a p.
            if len(lst) and lst[-1].text and not len(lst[-1]):
                p = markdown.etree.SubElement(lst[-1], 'p')
                p.text = lst[-1].text
                lst[-1].text = ''
            # parse first block differently as it gets wrapped in a p.
            li = markdown.etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        else:
            # This is a new list so create parent with appropriate tag.
            lst = markdown.etree.SubElement(parent, self.TAG)
        self.parser.state.set('list')
        # Loop through items in block, recursively parsing each with the
        # appropriate parent.
        for item in items:
            if item.startswith(' ' * markdown.TAB_LENGTH):
                # Item is indented. Parse with last item as parent
                self.parser.parseBlocks(lst[-1], [item])
            else:
                # New item. Create li and parse with it as parent
                li = markdown.etree.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items.

        The first line is expected to match ``CHILD_RE``; any other first
        line reaches ``items[-1]`` on an empty list and raises IndexError.
        """
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new item. Append
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(' ' * markdown.TAB_LENGTH):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that
                # item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items


class UListProcessor(OListProcessor):
    """ Process unordered list blocks. """

    TAG = 'ul'
    # Detect an item (``* item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')


class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers. """

    # Detect a header at start of any line in block
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        """ Return True if any line of the block is a hash header. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Create an ``h1``-``h6`` element for the first header found.

        Lines before the header are parsed first; lines after it are pushed
        back to the front of ``blocks``.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()]  # All lines before header
            after = block[m.end():]     # All lines after header
            if before:
                # As the header was not the first line of the block and the
                # lines before the header must be parsed first,
                # recursively parse these lines as a block.
                self.parser.parseBlocks(parent, [before])
            # Create header using named groups from RE
            h = markdown.etree.SubElement(
                parent, 'h%d' % len(m.group('level')))
            h.text = m.group('header').strip()
            if after:
                # Insert remaining lines as first block for future parsing.
                blocks.insert(0, after)
        else:
            # This should never happen, but just in case...
            # NOTE(review): the original called bare ``message`` and
            # ``CRITICAL``, neither of which is defined or imported in this
            # module (NameError if reached).  Presumably both are exposed by
            # the ``markdown`` package like the other helpers used here --
            # confirm.
            markdown.message(markdown.CRITICAL, "We've got a problem header!")


class SetextHeaderProcessor(BlockProcessor):
    """ Process Setext-style Headers. """

    # Detect Setext-style header. Must be first 2 lines of block.
    RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)

    def test(self, parent, block):
        """ Return True if the block's first two lines form a header. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Create an ``h1`` or ``h2`` from the first two lines of the block.

        Any further lines are pushed back to the front of ``blocks``.
        """
        lines = blocks.pop(0).split('\n')
        # Determine level. ``=`` is 1 and ``-`` is 2.
        if lines[1].startswith('='):
            level = 1
        else:
            level = 2
        h = markdown.etree.SubElement(parent, 'h%d' % level)
        h.text = lines[0].strip()
        if len(lines) > 2:
            # Block contains additional lines. Add to master blocks for
            # later.
            blocks.insert(0, '\n'.join(lines[2:]))


class HRProcessor(BlockProcessor):
    """ Process Horizontal Rules. """

    RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
    # Detect hr on any line of a block.
    SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
    # Match a hr on a single line of text.
    MATCH_RE = re.compile(r'^%s$' % RE)

    def test(self, parent, block):
        """ Return True if any line of the block is a horizontal rule. """
        return bool(self.SEARCH_RE.search(block))

    def run(self, parent, blocks):
        """ Create an ``hr`` for the first rule line in the block.

        Lines before the rule are parsed first; lines after it are pushed
        back to the front of ``blocks``.
        """
        lines = blocks.pop(0).split('\n')
        prelines = []
        # Check for lines in block before hr.
        for line in lines:
            if self.MATCH_RE.match(line):
                break
            prelines.append(line)
        if len(prelines):
            # Recursively parse lines before hr so they get parsed first.
            self.parser.parseBlocks(parent, ['\n'.join(prelines)])
        # create hr
        markdown.etree.SubElement(parent, 'hr')
        # check for lines in block after hr (skip prelines and the hr line).
        lines = lines[len(prelines) + 1:]
        if len(lines):
            # Add lines after hr to master blocks for later parsing.
            blocks.insert(0, '\n'.join(lines))


class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that start with an empty line. """

    # Detect a block that only contains whitespace
    # or only whitespace on the first line.
    RE = re.compile(r'^\s*\n')

    def test(self, parent, block):
        """ Return True if the block begins with a whitespace-only line. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Drop the leading blank part and requeue the rest of the block. """
        block = blocks.pop(0)
        m = self.RE.match(block)
        if m:
            # Add remaining line to master blocks for later.
            blocks.insert(0, block[m.end():])
            sibling = self.lastChild(parent)
            # NOTE(review): the original guard tested the truth value of
            # ``sibling`` and ``sibling[0]``, which for etree elements means
            # "has children"; that is reproduced here with ``len``.  The
            # ``code`` elements built by CodeBlockProcessor only get text, so
            # this branch does not fire for them.  Relaxing the guard would
            # change output and needs a decision on how many newlines to
            # restore -- TODO confirm intent.
            if sibling is not None and len(sibling) and \
                    sibling.tag == 'pre' and len(sibling[0]) and \
                    sibling[0].tag == 'code':
                # Last block is a codeblock. Append to preserve whitespace.
                # (The original literal was '/n/n/n' -- forward slashes
                # instead of newline escapes.)
                sibling[0].text = markdown.AtomicString(
                    '%s\n\n\n' % sibling[0].text)


class ParagraphProcessor(BlockProcessor):
    """ Process Paragraph blocks. """

    def test(self, parent, block):
        """ Always True: any block can be treated as a paragraph. """
        return True

    def run(self, parent, blocks):
        """ Add the block as text of a tight list item or as a new ``p``. """
        block = blocks.pop(0)
        if block.strip():
            # Not a blank block. Add to parent, otherwise throw it away.
            if self.parser.state.isstate('list'):
                # The parent is a tight-list. Append to parent.text
                if parent.text:
                    parent.text = '%s\n%s' % (parent.text, block)
                else:
                    parent.text = block.lstrip()
            else:
                # Create a regular paragraph
                p = markdown.etree.SubElement(parent, 'p')
                p.text = block.lstrip()