"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).
"""

import codecs
import sys
import logging
import importlib
from . import util
from .preprocessors import build_preprocessors
from .blockprocessors import build_block_parser
from .treeprocessors import build_treeprocessors
from .inlinepatterns import build_inlinepatterns
from .postprocessors import build_postprocessors
from .extensions import Extension
from .serializers import to_html_string, to_xhtml_string

__all__ = ['Markdown', 'markdown', 'markdownFromFile']


logger = logging.getLogger('MARKDOWN')


class Markdown:
    """Convert Markdown to HTML."""

    doc_tag = "div"     # Element used to wrap document - later removed

    # Maps an output format name to the serializer used for that format.
    output_formats = {
        'html':   to_html_string,
        'xhtml':  to_xhtml_string,
    }

    def __init__(self, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
            If an item is an instance of a subclass of `markdown.extensions.Extension`, the instance will be used
            as-is. If an item is of type string, first an entry point will be loaded. If that fails, the string is
            assumed to use Python dot notation (`path.to.module:ClassName`) to load a markdown.Extension subclass. If
            no class is specified, then a `makeExtension` function is called within the specified module.
        * extension_configs: Configuration settings for extensions.
        * output_format: Format of output. Supported formats are:
            * "xhtml": Outputs XHTML style tags. Default.
            * "html": Outputs HTML style tags.
        * tab_length: Length of tabs in the source. Default: 4

        Any other keyword arguments are ignored.

        """

        self.tab_length = kwargs.get('tab_length', 4)

        self.ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
                              '(', ')', '>', '#', '+', '-', '.', '!']

        self.block_level_elements = [
            # Elements which are invalid to wrap in a `<p>` tag.
            # See https://w3c.github.io/html/grouping-content.html#the-p-element
            'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
            'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
            'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
            'p', 'pre', 'section', 'table', 'ul',
            # Other elements which Markdown should not be mucking up the contents of.
            'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
            'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
            'style', 'summary', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
        ]

        self.registeredExtensions = []
        self.docType = ""
        self.stripTopLevelTags = True

        # The parser must exist before extensions are registered, because
        # each extension is handed this instance via `extendMarkdown(self)`.
        self.build_parser()

        self.references = {}
        self.htmlStash = util.HtmlStash()
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.set_output_format(kwargs.get('output_format', 'xhtml'))
        self.reset()

    def build_parser(self):
        """ Build the parser from the various parts. Returns `self`. """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.
        * configs: A dictionary mapping extension names to config options.

        Items which are `None` are silently skipped. Any other item which is
        not (or does not build to) an `Extension` instance raises `TypeError`.

        Returns `self`.

        """
        for ext in extensions:
            if isinstance(ext, str):
                ext = self.build_extension(ext, configs.get(ext, {}))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self)
                # Lazy %-style arguments: only formatted if DEBUG is enabled.
                logger.debug(
                    'Successfully loaded extension "%s.%s".',
                    ext.__class__.__module__, ext.__class__.__name__
                )
            elif ext is not None:
                raise TypeError(
                    'Extension "{}.{}" must be of type: "{}.{}"'.format(
                        ext.__class__.__module__, ext.__class__.__name__,
                        Extension.__module__, Extension.__name__
                    )
                )
        return self

    def build_extension(self, ext_name, configs):
        """
        Build extension from a string name, then return an instance.

        First attempt to load an entry point. The string name must be registered as an entry point in the
        `markdown.extensions` group which points to a subclass of the `markdown.extensions.Extension` class.
        If multiple distributions have registered the same name, the first one found is returned.

        If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
        return an instance. If no class is specified, import the module and call a `makeExtension` function and return
        the Extension instance returned by that function.

        Raises `ImportError` if the module cannot be imported and
        `AttributeError` if the call to `makeExtension` fails with one; in
        both cases the original exception is re-raised with a message that
        names the extension.
        """
        # Work on a copy so the caller's mapping is not shared with the extension.
        configs = dict(configs)

        entry_points = [ep for ep in util.get_installed_extensions() if ep.name == ext_name]
        if entry_points:
            ext = entry_points[0].load()
            return ext(**configs)

        # Get class name (if provided): `path.to.module:ClassName`
        ext_name, _, class_name = ext_name.partition(':')

        try:
            module = importlib.import_module(ext_name)
            logger.debug(
                'Successfully imported extension module "%s".', ext_name
            )
        except ImportError as e:
            message = 'Failed loading extension "%s".' % ext_name
            e.args = (message,) + e.args[1:]
            raise

        if class_name:
            # Load given class name from module.
            return getattr(module, class_name)(**configs)
        else:
            # Expect makeExtension() function to return an Extension instance.
            try:
                return module.makeExtension(**configs)
            except AttributeError as e:
                # The exception may carry no args at all (a bare
                # `raise AttributeError`); do not mask it with an IndexError.
                message = e.args[0] if e.args else ''
                message = "Failed to initiate extension " \
                          "'%s': %s" % (ext_name, message)
                e.args = (message,) + e.args[1:]
                raise

    def registerExtension(self, extension):
        """ This gets called by the extension. Returns `self`. """
        self.registeredExtensions.append(extension)
        return self

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.

        Clears the HTML stash and the references, then calls `reset()` on
        every registered extension which defines one. Returns `self`.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format):
        """
        Set the output format for the class instance.

        The name is lower-cased and any trailing `1`, `4` or `5` characters
        are stripped before it is looked up in `output_formats`.

        Raises `KeyError` (with a message listing the valid formats) if the
        resulting name is unknown. Returns `self`.
        """
        self.output_format = format.lower().rstrip('145')  # ignore num
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats)
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                % (self.output_format,
                   '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    def is_block_level(self, tag):
        """Check if the tag is a block level HTML tag."""
        if isinstance(tag, str):
            return tag.lower().rstrip('/') in self.block_level_elements
        # Some ElementTree tags are not strings, so return False.
        return False

    def convert(self, source):
        """
        Convert markdown to serialized XHTML or HTML.

        Keyword arguments:

        * source: Source text as a Unicode string.

        Returns the converted document as a string; an empty string is
        returned when `source` contains only whitespace.

        Markdown processing takes place in five steps:

        1. A bunch of "preprocessors" munge the input text.
        2. BlockParser() parses the high-level structural elements of the
           pre-processed text into an ElementTree.
        3. A bunch of "treeprocessors" are run against the ElementTree. One
           such treeprocessor runs InlinePatterns against the ElementTree,
           detecting inline markup.
        4. Some post-processors are run against the text after the ElementTree
           has been serialized into text.
        5. The output is written to a string.

        """

        # Fixup the source text
        if not source.strip():
            return ''  # a blank unicode string

        try:
            source = str(source)
        except UnicodeDecodeError as e:  # pragma: no cover
            # Customise error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors:
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors:
            newRoot = treeprocessor.run(root)
            # A treeprocessor may return a replacement root, or None to keep
            # the current one.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                # Skip past the opening `<doc_tag>`; the `+ 2` accounts for
                # the `<` and `>` characters around the tag name.
                start = output.index(
                    '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
                end = output.rindex('</%s>' % self.doc_tag)
                output = output[start:end].strip()
            except ValueError as e:  # pragma: no cover
                if output.strip().endswith('<%s />' % self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level '
                                     'tags. Document=%r' % output.strip()) from e

        # Run the text post-processors
        for pp in self.postprocessors:
            output = pp.run(output)

        return output.strip()

    def convertFile(self, input=None, output=None, encoding=None):
        """Converts a markdown file and writes the resulting HTML.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The 'xmlcharrefreplace' error handler is
        used when encoding the output.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: File object or path. Reads from stdin if `None`.
        * output: File object or path. Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        Returns `self` (the HTML itself is written, not returned).

        A provided input file object is closed after it has been read. A
        provided output file object is left open.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, str):
                input_file = codecs.open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            # Always release the input, even if reading/decoding fails.
            try:
                text = input_file.read()
            finally:
                input_file.close()
        else:
            text = sys.stdin.read()
            if not isinstance(text, str):  # pragma: no cover
                text = text.decode(encoding)

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, str):
                # The context manager closes the file even if writing fails.
                with codecs.open(output, "w",
                                 encoding=encoding,
                                 errors="xmlcharrefreplace") as output_file:
                    output_file.write(html)
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            html = html.encode(encoding, "xmlcharrefreplace")
            try:
                # Write bytes directly to buffer (Python 3).
                sys.stdout.buffer.write(html)
            except AttributeError:  # pragma: no cover
                # Probably Python 2, which works with bytes by default.
                sys.stdout.write(html)

        return self


"""
EXPORTED FUNCTIONS
=============================================================================

Those are the two functions we really mean to export: markdown() and
markdownFromFile().
"""


def markdown(text, **kwargs):
    """Convert a markdown string to HTML and return HTML as a unicode string.

    This is a shortcut function for `Markdown` class to cover the most
    basic use case.  It initializes an instance of Markdown, loads the
    necessary extensions and runs the parser on the given text.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * Any arguments accepted by the Markdown class.

    Returns: An HTML document as a string.

    """
    md = Markdown(**kwargs)
    return md.convert(text)


def markdownFromFile(**kwargs):
    """Read markdown code from a file and write it to a file or a stream.

    This is a shortcut function which initializes an instance of Markdown,
    and calls the convertFile method rather than convert.

    Keyword arguments:

    * input: a file name or readable object.
    * output: a file name or writable object.
    * encoding: Encoding of input and output.
    * Any arguments accepted by the Markdown class.

    Returns: `None`. The converted document is written to `output`.

    """
    md = Markdown(**kwargs)
    md.convertFile(kwargs.get('input', None),
                   kwargs.get('output', None),
                   kwargs.get('encoding', None))