1# markdown is released under the BSD license 2# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later) 3# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 4# Copyright 2004 Manfred Stienstra (the original version) 5# 6# All rights reserved. 7# 8# Redistribution and use in source and binary forms, with or without 9# modification, are permitted provided that the following conditions are met: 10# 11# * Redistributions of source code must retain the above copyright 12# notice, this list of conditions and the following disclaimer. 13# * Redistributions in binary form must reproduce the above copyright 14# notice, this list of conditions and the following disclaimer in the 15# documentation and/or other materials provided with the distribution. 16# * Neither the name of the <organization> nor the 17# names of its contributors may be used to endorse or promote products 18# derived from this software without specific prior written permission. 19# 20# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY 21# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT 24# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30# POSSIBILITY OF SUCH DAMAGE. 31 32 33""" 34Python Markdown 35=============== 36 37Python Markdown converts Markdown to HTML and can be used as a library or 38called from the command line. 39 40## Basic usage as a module: 41 42 import markdown 43 html = markdown.markdown(your_text_string) 44 45See <http://packages.python.org/Markdown/> for more 46information and instructions on how to extend the functionality of 47Python Markdown. Read that before you try modifying this file. 48 49## Authors and License 50 51Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and 52maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan 53Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com). 54 55Contact: markdown@freewisdom.org 56 57Copyright 2007-2013 The Python Markdown Project (v. 1.7 and later) 58Copyright 200? Django Software Foundation (OrderedDict implementation) 59Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 60Copyright 2004 Manfred Stienstra (the original version) 61 62License: BSD (see LICENSE for details). 63""" 64 65from __future__ import absolute_import 66from __future__ import unicode_literals 67from .__version__ import version, version_info 68import re 69import codecs 70import sys 71import logging 72from . import util 73from .preprocessors import build_preprocessors 74from .blockprocessors import build_block_parser 75from .treeprocessors import build_treeprocessors 76from .inlinepatterns import build_inlinepatterns 77from .postprocessors import build_postprocessors 78from .extensions import Extension 79from .serializers import to_html_string, to_xhtml_string 80 81__all__ = ['Markdown', 'markdown', 'markdownFromFile'] 82 83logger = logging.getLogger('MARKDOWN') 84 85 86class Markdown(object): 87 """Convert Markdown to HTML.""" 88 89 doc_tag = "div" # Element used to wrap document - later removed 90 91 option_defaults = { 92 'html_replacement_text' : '[HTML_REMOVED]', 93 'tab_length' : 4, 94 'enable_attributes' : True, 95 'smart_emphasis' : True, 96 'lazy_ol' : True, 97 } 98 99 output_formats = { 100 'html' : to_html_string, 101 'html4' : to_html_string, 102 'html5' : to_html_string, 103 'xhtml' : to_xhtml_string, 104 'xhtml1': to_xhtml_string, 105 'xhtml5': to_xhtml_string, 106 } 107 108 ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']', 109 '(', ')', '>', '#', '+', '-', '.', '!'] 110 111 def __init__(self, *args, **kwargs): 112 """ 113 Creates a new Markdown instance. 114 115 Keyword arguments: 116 117 * extensions: A list of extensions. 118 If they are of type string, the module mdx_name.py will be loaded. 119 If they are a subclass of markdown.Extension, they will be used 120 as-is. 121 * extension_configs: Configuration settingis for extensions. 122 * output_format: Format of output. Supported formats are: 123 * "xhtml1": Outputs XHTML 1.x. Default. 124 * "xhtml5": Outputs XHTML style tags of HTML 5 125 * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1). 126 * "html4": Outputs HTML 4 127 * "html5": Outputs HTML style tags of HTML 5 128 * "html": Outputs latest supported version of HTML (currently HTML 4). 129 Note that it is suggested that the more specific formats ("xhtml1" 130 and "html4") be used as "xhtml" or "html" may change in the future 131 if it makes sense at that time. 132 * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". 133 * html_replacement_text: Text used when safe_mode is set to "replace". 134 * tab_length: Length of tabs in the source. Default: 4 135 * enable_attributes: Enable the conversion of attributes. Default: True 136 * smart_emphasis: Treat `_connected_words_` intelegently Default: True 137 * lazy_ol: Ignore number of first item of ordered lists. Default: True 138 139 """ 140 141 # For backward compatibility, loop through old positional args 142 pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format'] 143 c = 0 144 for arg in args: 145 if pos[c] not in kwargs: 146 kwargs[pos[c]] = arg 147 c += 1 148 if c == len(pos): 149 # ignore any additional args 150 break 151 152 # Loop through kwargs and assign defaults 153 for option, default in self.option_defaults.items(): 154 setattr(self, option, kwargs.get(option, default)) 155 156 self.safeMode = kwargs.get('safe_mode', False) 157 if self.safeMode and 'enable_attributes' not in kwargs: 158 # Disable attributes in safeMode when not explicitly set 159 self.enable_attributes = False 160 161 self.registeredExtensions = [] 162 self.docType = "" 163 self.stripTopLevelTags = True 164 165 self.build_parser() 166 167 self.references = {} 168 self.htmlStash = util.HtmlStash() 169 self.set_output_format(kwargs.get('output_format', 'xhtml1')) 170 self.registerExtensions(extensions=kwargs.get('extensions', []), 171 configs=kwargs.get('extension_configs', {})) 172 self.reset() 173 174 def build_parser(self): 175 """ Build the parser from the various parts. """ 176 self.preprocessors = build_preprocessors(self) 177 self.parser = build_block_parser(self) 178 self.inlinePatterns = build_inlinepatterns(self) 179 self.treeprocessors = build_treeprocessors(self) 180 self.postprocessors = build_postprocessors(self) 181 return self 182 183 def registerExtensions(self, extensions, configs): 184 """ 185 Register extensions with this instance of Markdown. 186 187 Keyword arguments: 188 189 * extensions: A list of extensions, which can either 190 be strings or objects. See the docstring on Markdown. 191 * configs: A dictionary mapping module names to config options. 192 193 """ 194 for ext in extensions: 195 if isinstance(ext, util.string_type): 196 ext = self.build_extension(ext, configs.get(ext, [])) 197 if isinstance(ext, Extension): 198 ext.extendMarkdown(self, globals()) 199 elif ext is not None: 200 raise TypeError( 201 'Extension "%s.%s" must be of type: "markdown.Extension"' 202 % (ext.__class__.__module__, ext.__class__.__name__)) 203 204 return self 205 206 def build_extension(self, ext_name, configs = []): 207 """Build extension by name, then return the module. 208 209 The extension name may contain arguments as part of the string in the 210 following format: "extname(key1=value1,key2=value2)" 211 212 """ 213 214 # Parse extensions config params (ignore the order) 215 configs = dict(configs) 216 pos = ext_name.find("(") # find the first "(" 217 if pos > 0: 218 ext_args = ext_name[pos+1:-1] 219 ext_name = ext_name[:pos] 220 pairs = [x.split("=") for x in ext_args.split(",")] 221 configs.update([(x.strip(), y.strip()) for (x, y) in pairs]) 222 223 # Setup the module name 224 module_name = ext_name 225 if '.' not in ext_name: 226 module_name = '.'.join(['third_party.markdown.extensions', ext_name]) 227 228 # Try loading the extension first from one place, then another 229 try: # New style (markdown.extensons.<extension>) 230 module = __import__(module_name, {}, {}, [module_name.rpartition('.')[0]]) 231 except ImportError: 232 module_name_old_style = '_'.join(['mdx', ext_name]) 233 try: # Old style (mdx_<extension>) 234 module = __import__(module_name_old_style) 235 except ImportError as e: 236 message = "Failed loading extension '%s' from '%s' or '%s'" \ 237 % (ext_name, module_name, module_name_old_style) 238 e.args = (message,) + e.args[1:] 239 raise 240 241 # If the module is loaded successfully, we expect it to define a 242 # function called makeExtension() 243 try: 244 return module.makeExtension(configs.items()) 245 except AttributeError as e: 246 message = e.args[0] 247 message = "Failed to initiate extension " \ 248 "'%s': %s" % (ext_name, message) 249 e.args = (message,) + e.args[1:] 250 raise 251 252 def registerExtension(self, extension): 253 """ This gets called by the extension """ 254 self.registeredExtensions.append(extension) 255 return self 256 257 def reset(self): 258 """ 259 Resets all state variables so that we can start with a new text. 260 """ 261 self.htmlStash.reset() 262 self.references.clear() 263 264 for extension in self.registeredExtensions: 265 if hasattr(extension, 'reset'): 266 extension.reset() 267 268 return self 269 270 def set_output_format(self, format): 271 """ Set the output format for the class instance. """ 272 self.output_format = format.lower() 273 try: 274 self.serializer = self.output_formats[self.output_format] 275 except KeyError as e: 276 valid_formats = list(self.output_formats.keys()) 277 valid_formats.sort() 278 message = 'Invalid Output Format: "%s". Use one of %s.' \ 279 % (self.output_format, 280 '"' + '", "'.join(valid_formats) + '"') 281 e.args = (message,) + e.args[1:] 282 raise 283 return self 284 285 def convert(self, source): 286 """ 287 Convert markdown to serialized XHTML or HTML. 288 289 Keyword arguments: 290 291 * source: Source text as a Unicode string. 292 293 Markdown processing takes place in five steps: 294 295 1. A bunch of "preprocessors" munge the input text. 296 2. BlockParser() parses the high-level structural elements of the 297 pre-processed text into an ElementTree. 298 3. A bunch of "treeprocessors" are run against the ElementTree. One 299 such treeprocessor runs InlinePatterns against the ElementTree, 300 detecting inline markup. 301 4. Some post-processors are run against the text after the ElementTree 302 has been serialized into text. 303 5. The output is written to a string. 304 305 """ 306 307 # Fixup the source text 308 if not source.strip(): 309 return '' # a blank unicode string 310 311 try: 312 source = util.text_type(source) 313 except UnicodeDecodeError as e: 314 # Customise error message while maintaining original trackback 315 e.reason += '. -- Note: Markdown only accepts unicode input!' 316 raise 317 318 # Split into lines and run the line preprocessors. 319 self.lines = source.split("\n") 320 for prep in self.preprocessors.values(): 321 self.lines = prep.run(self.lines) 322 323 # Parse the high-level elements. 324 root = self.parser.parseDocument(self.lines).getroot() 325 326 # Run the tree-processors 327 for treeprocessor in self.treeprocessors.values(): 328 newRoot = treeprocessor.run(root) 329 if newRoot: 330 root = newRoot 331 332 # Serialize _properly_. Strip top-level tags. 333 output = self.serializer(root) 334 if self.stripTopLevelTags: 335 try: 336 start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2 337 end = output.rindex('</%s>'%self.doc_tag) 338 output = output[start:end].strip() 339 except ValueError: 340 if output.strip().endswith('<%s />'%self.doc_tag): 341 # We have an empty document 342 output = '' 343 else: 344 # We have a serious problem 345 raise ValueError('Markdown failed to strip top-level tags. Document=%r' % output.strip()) 346 347 # Run the text post-processors 348 for pp in self.postprocessors.values(): 349 output = pp.run(output) 350 351 return output.strip() 352 353 def convertFile(self, input=None, output=None, encoding=None): 354 """Converts a markdown file and returns the HTML as a unicode string. 355 356 Decodes the file using the provided encoding (defaults to utf-8), 357 passes the file content to markdown, and outputs the html to either 358 the provided stream or the file with provided name, using the same 359 encoding as the source file. The 'xmlcharrefreplace' error handler is 360 used when encoding the output. 361 362 **Note:** This is the only place that decoding and encoding of unicode 363 takes place in Python-Markdown. (All other code is unicode-in / 364 unicode-out.) 365 366 Keyword arguments: 367 368 * input: File object or path. Reads from stdin if `None`. 369 * output: File object or path. Writes to stdout if `None`. 370 * encoding: Encoding of input and output files. Defaults to utf-8. 371 372 """ 373 374 encoding = encoding or "utf-8" 375 376 # Read the source 377 if input: 378 if isinstance(input, util.string_type): 379 input_file = codecs.open(input, mode="r", encoding=encoding) 380 else: 381 input_file = codecs.getreader(encoding)(input) 382 text = input_file.read() 383 input_file.close() 384 else: 385 text = sys.stdin.read() 386 if not isinstance(text, util.text_type): 387 text = text.decode(encoding) 388 389 text = text.lstrip('\ufeff') # remove the byte-order mark 390 391 # Convert 392 html = self.convert(text) 393 394 # Write to file or stdout 395 if output: 396 if isinstance(output, util.string_type): 397 output_file = codecs.open(output, "w", 398 encoding=encoding, 399 errors="xmlcharrefreplace") 400 output_file.write(html) 401 output_file.close() 402 else: 403 writer = codecs.getwriter(encoding) 404 output_file = writer(output, errors="xmlcharrefreplace") 405 output_file.write(html) 406 # Don't close here. User may want to write more. 407 else: 408 # Encode manually and write bytes to stdout. 409 html = html.encode(encoding, "xmlcharrefreplace") 410 try: 411 # Write bytes directly to buffer (Python 3). 412 sys.stdout.buffer.write(html) 413 except AttributeError: 414 # Probably Python 2, which works with bytes by default. 415 sys.stdout.write(html) 416 417 return self 418 419 420""" 421EXPORTED FUNCTIONS 422============================================================================= 423 424Those are the two functions we really mean to export: markdown() and 425markdownFromFile(). 426""" 427 428def markdown(text, *args, **kwargs): 429 """Convert a markdown string to HTML and return HTML as a unicode string. 430 431 This is a shortcut function for `Markdown` class to cover the most 432 basic use case. It initializes an instance of Markdown, loads the 433 necessary extensions and runs the parser on the given text. 434 435 Keyword arguments: 436 437 * text: Markdown formatted text as Unicode or ASCII string. 438 * Any arguments accepted by the Markdown class. 439 440 Returns: An HTML document as a string. 441 442 """ 443 md = Markdown(*args, **kwargs) 444 return md.convert(text) 445 446 447def markdownFromFile(*args, **kwargs): 448 """Read markdown code from a file and write it to a file or a stream. 449 450 This is a shortcut function which initializes an instance of Markdown, 451 and calls the convertFile method rather than convert. 452 453 Keyword arguments: 454 455 * input: a file name or readable object. 456 * output: a file name or writable object. 457 * encoding: Encoding of input and output. 458 * Any arguments accepted by the Markdown class. 459 460 """ 461 # For backward compatibility loop through positional args 462 pos = ['input', 'output', 'extensions', 'encoding'] 463 c = 0 464 for arg in args: 465 if pos[c] not in kwargs: 466 kwargs[pos[c]] = arg 467 c += 1 468 if c == len(pos): 469 break 470 471 md = Markdown(**kwargs) 472 md.convertFile(kwargs.get('input', None), 473 kwargs.get('output', None), 474 kwargs.get('encoding', None)) 475 476