"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
"""

import re
import markdown

HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX


class Processor:
    """ Base class for objects that may be bound to a Markdown instance. """

    def __init__(self, markdown_instance=None):
        """
        Remember `markdown_instance` as `self.markdown` when one is given.

        When the argument is omitted (or falsy) the `markdown` attribute is
        left unset.

        """
        if markdown_instance:
            self.markdown = markdown_instance


class Preprocessor (Processor):
    """
    Preprocessors are run after the text is broken into lines.

    Each preprocessor implements a "run" method that takes a pointer to a
    list of lines of the document, modifies it as necessary and returns
    either the same pointer or a pointer to a new list.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Each subclass of Preprocessor should override the `run` method, which
        takes the document as a list of strings split by newlines and returns
        the (possibly modified) list of lines.

        This base implementation does nothing and returns None.

        """
        pass


class HtmlStash:
    """
    This class is used for stashing HTML objects that we extract
    in the beginning and replace with place-holders.
    """

    def __init__(self):
        """ Create a HtmlStash. """
        self.html_counter = 0  # for counting inline html segments
        self.rawHtmlBlocks = []

    def store(self, html, safe=False):
        """
        Saves an HTML segment for later reinsertion.  Returns a
        placeholder string that needs to be inserted into the
        document.

        Keyword arguments:

        * html: an html segment
        * safe: label an html segment as safe for safemode

        Returns : a placeholder string

        """
        self.rawHtmlBlocks.append((html, safe))
        placeholder = HTML_PLACEHOLDER % self.html_counter
        self.html_counter += 1
        return placeholder

    def reset(self):
        """ Forget every stored segment and restart the placeholder count. """
        self.html_counter = 0
        self.rawHtmlBlocks = []


class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates (filled with the left tag) tried, in order, when looking for
    # the text that closes a block.
    right_tag_patterns = ["</%s>", "%s>"]

    def _get_left_tag(self, block):
        """
        Return the lower-cased first word after the leading character of
        `block`, treating the first ">" as a word separator.

        Returns an empty string when there is no such word (for example when
        the block is just "<"), so that callers can treat the block as plain
        text instead of failing.

        """
        words = block[1:].replace(">", " ", 1).split()
        if not words:
            return ''
        return words[0].lower()

    def _get_right_tag(self, left_tag, block):
        """
        Find the closing counterpart of `left_tag` in `block`.

        Returns a `(right_tag, data_index)` tuple.  When one of
        `right_tag_patterns` is found past index 2, `right_tag` is the
        matched text stripped of leading "<" and trailing ">" characters and
        `data_index` is the index just past that match.  Otherwise
        `right_tag` is sliced from the end of the stripped block and
        `data_index` is `len(block)`.

        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = block.rfind(tag)
            if i > 2:
                # len(p) - 2 discounts the "%s" in the pattern.
                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True if `right_tag` is accepted as closing `left_tag`.

        A left tag of "div", or one starting with "?", "@" or "%", is always
        accepted.

        """
        # Slices rather than indexes: either tag may be an empty string.
        if left_tag == 'div' or left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if right_tag == "--" and left_tag == "--":
            return True
        if left_tag == right_tag[1:] and right_tag[:1] != "<":
            return True
        return False

    def _is_oneliner(self, tag):
        """ Return True if `tag` is "hr" or "hr/". """
        return (tag in ['hr', 'hr/'])

    def run(self, lines):
        """
        Replace raw HTML blocks in the document with stash placeholders.

        The lines are joined and re-split on blank lines into blocks.  Blocks
        that start with "<" and are recognised as block-level HTML, comments
        or processing instructions are handed to
        `self.markdown.htmlStash.store` and replaced by the placeholder it
        returns; an element that spans several blocks is collected until its
        closing tag is seen and stored as one segment.  Other blocks are
        passed through.

        Returns the resulting document as a list of lines.

        """
        blocks = "\n".join(lines).split("\n\n")
        new_blocks = []
        items = []       # blocks of the multi-block element being collected
        left_tag = ''
        in_tag = False   # True while inside a multi-block element

        while blocks:
            block = blocks.pop(0)

            # Up to two leading newlines are dropped from every block.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<"):
                    left_tag = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag, block)

                    # block[1:2] instead of block[1]: the block may be a
                    # lone "<".
                    if block[1:2] == "!":
                        # is a comment block
                        left_tag = "--"
                        right_tag, data_index = self._get_right_tag(left_tag, block)
                        # keep checking conditions below and maybe just append

                    if data_index < len(block) \
                            and markdown.isBlockLevel(left_tag):
                        # Text follows the closing tag: push it back so it is
                        # processed as a block of its own.
                        blocks.insert(0, block[data_index:])
                        block = block[:data_index]

                    if not (markdown.isBlockLevel(left_tag)
                            or block[1:2] in ("!", "?", "@", "%")):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        new_blocks.append(
                            self.markdown.htmlStash.store(block.strip()))
                        continue

                    # The block is not a complete element.  Note that `and`
                    # binds tighter than `or`: any block-level tag starts a
                    # multi-block element, while the "does not end with >"
                    # test applies to comments only.
                    if markdown.isBlockLevel(left_tag) or (
                            left_tag == "--"
                            and not block.rstrip().endswith(">")):
                        items.append(block.strip())
                        in_tag = True
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store(block.strip()))
                    continue

                new_blocks.append(block)

            else:
                items.append(block.strip())

                right_tag, _ = self._get_right_tag(left_tag, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag
                    in_tag = False
                    new_blocks.append(
                        self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        if items:
            # The document ended inside an unclosed element: stash what was
            # collected so far.
            new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")


class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

    def run(self, lines):
        """
        Move reference definitions from `lines` into
        `self.markdown.references`.

        A line matching `RE` is stored under its stripped, lower-cased id as
        a `(link, title)` tuple and dropped from the output, provided the
        text after the link is either empty or a title wrapped in double
        quotes, single quotes or parentheses.  Every other line is kept.

        Returns the list of remaining lines.

        """
        new_text = []
        for line in lines:
            m = self.RE.match(line)
            if m:
                ref_id = m.group(2).strip().lower()
                t = m.group(4).strip()  # potential title
                if not t:
                    self.markdown.references[ref_id] = (m.group(3), t)
                elif (len(t) >= 2
                      and (t[0] == t[-1] == "\""
                           or t[0] == t[-1] == "\'"
                           or (t[0] == "(" and t[-1] == ")"))):
                    self.markdown.references[ref_id] = (m.group(3), t[1:-1])
                else:
                    # Trailing text that is not a valid title: leave the
                    # line in the document untouched.
                    new_text.append(line)
            else:
                new_text.append(line)

        return new_text