"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).
"""

import re
import importlib.util
import sys


# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
blank_line_re = re.compile(r'^([ ]*\n){2}')


class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the `htmlStash` of the Markdown instance passed
    to `md` and the remaining text is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md, *args, **kwargs):
        """
        Create an extractor bound to the Markdown instance `md`.

        `md` must provide `is_block_level(tag)` and `htmlStash.store(html)`.
        All other arguments are passed through to `HTMLParser`;
        `convert_charrefs` defaults to `False` unless given explicitly.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack = []  # When inraw==True, stack contains a list of tags
        self._cache = []
        self.cleandoc = []
        # Incremental index of line-start offsets used by `line_offset`.
        # `_line_starts[k]` is the char index in `_line_starts_src` at which
        # line `k + 1` begins; `_line_starts_src` is the `rawdata` string the
        # index was built for.
        self._line_starts = [0]
        self._line_starts_src = None
        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """
        Returns char index in self.rawdata for the start of the current line.

        This property is read for nearly every tag, so line starts are
        indexed incrementally instead of re-scanning `rawdata` from the
        beginning on each access (which made extraction quadratic in the
        size of the document). The index is rebuilt whenever `rawdata` is
        rebound to a different string object.
        """
        raw = self.rawdata
        if self._line_starts_src is not raw:
            # `rawdata` was replaced (new feed, or truncated after a feed);
            # offsets computed for the old string no longer apply.
            self._line_starts_src = raw
            self._line_starts = [0]
        starts = self._line_starts
        wanted = self.lineno - 1
        while len(starts) <= wanted:
            newline = raw.find('\n', starts[-1])
            if newline == -1:  # pragma: no cover
                # Value of self.lineno must exceed total number of lines.
                # NOTE(review): this yields the index of the last newline
                # itself (or 0 when there is none), not the first char of the
                # last line. Kept as is for backward compatibility.
                return max(raw.rfind('\n'), 0)
            starts.append(newline + 1)
        return starts[wanted] if wanted > 0 else 0

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return self.rawdata[line_start:line_start + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return '</{}>'.format(tag)

    def handle_starttag(self, tag, attrs):
        """
        Handle a start tag.

        Tags in `empty_tags` are routed to `handle_startendtag`. A block-level
        tag opens a raw block when it follows the tail of a previous raw block,
        or when it sits at the start of a line outside of any raw block. While
        in a raw block the tag text is buffered; otherwise it is passed through
        to `cleandoc` unchanged.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag):
        """
        Handle an end tag.

        Inside a raw block the tag text is buffered and the tag stack is
        unwound down to the matching start tag. Once the stack is empty the
        buffered block is stored in the stash and its placeholder is added to
        `cleandoc`. Outside a raw block the tag text is passed through.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after endtag.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data):
        """
        Handle text data.

        A newline in the data ends the "tail" state of a preceding raw block.
        Data is buffered while in a raw block and passed through otherwise.
        """
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data, is_block):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag using the start tag's source text."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name):
        """Handle a character reference by re-emitting it as `&#name;` (never block level)."""
        self.handle_empty_tag('&#{};'.format(name), is_block=False)

    def handle_entityref(self, name):
        """Handle an entity reference by re-emitting it as `&name;` (never block level)."""
        self.handle_empty_tag('&{};'.format(name), is_block=False)

    def handle_comment(self, data):
        """Handle a comment by re-emitting it as a block-level `<!--data-->`."""
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

    def handle_decl(self, data):
        """Handle a declaration by re-emitting it as a block-level `<!data>`."""
        self.handle_empty_tag('<!{}>'.format(data), is_block=True)

    def handle_pi(self, data):
        """Handle a processing instruction by re-emitting it as a block-level `<?data?>`."""
        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

    def unknown_decl(self, data):
        """Handle an unknown declaration; a `CDATA[` section is closed with `]]>`, anything else with `]>`."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

    def parse_pi(self, i):
        """Parse a processing instruction only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i):
        """Parse an HTML declaration only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    # The rest has been copied from base class in standard lib to address #1036.
    # As __starttag_text is private, all references to it must be in this subclass.
    # The last few lines of parse_starttag are reversed so that handle_starttag
    # can override cdata_mode in certain situations (in a code span).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def parse_starttag(self, i):  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `rawdata`.

        Returns the index just past the tag, or a negative value when the tag
        is not yet complete. Dispatches to `handle_startendtag` for
        XHTML-style empty tags and to `handle_starttag` otherwise; malformed
        tag endings are passed to `handle_data` verbatim.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk at the end of the tag: emit the whole thing as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos