• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Python Markdown
3===============
4
5Python Markdown converts Markdown to HTML and can be used as a library or
6called from the command line.
7
8## Basic usage as a module:
9
10    import markdown
11    md = markdown.Markdown()
12    html = md.convert(your_text_string)
13
14## Basic use from the command line:
15
16    markdown source.txt > destination.html
17
18Run "markdown --help" to see more options.
19
20## Extensions
21
22See <http://www.freewisdom.org/projects/python-markdown/> for more
23information and instructions on how to extend the functionality of
24Python Markdown.  Read that before you try modifying this file.
25
26## Authors and License
27
28Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
29maintained  by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
30Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
31
32Contact: markdown@freewisdom.org
33
34Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
35Copyright 200? Django Software Foundation (OrderedDict implementation)
36Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
37Copyright 2004 Manfred Stienstra (the original version)
38
39License: BSD (see docs/LICENSE for details).
40"""
41
# Release identifier: once as a tuple of components, once as a dotted string.
version_info = (2, 0, 3, "Final")
version = "2.0.3"
44
45import re
46import codecs
47import sys
48import warnings
49import logging
50from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
51
52
53"""
54CONSTANTS
55=============================================================================
56"""
57
58"""
59Constants you might want to modify
60-----------------------------------------------------------------------------
61"""
62
# default logging level for command-line use
COMMAND_LINE_LOGGING_LEVEL = CRITICAL
TAB_LENGTH = 4               # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
DEFAULT_OUTPUT_FORMAT = 'xhtml1'     # xhtml or html4 output
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode

# Tag names treated as block level.  The pattern is anchored at both ends:
# it is used through ``.match()``, which only anchors at the start, so an
# unanchored alternation would also accept any tag that merely *begins* with
# one of these names (e.g. "param" via "p", or "link" via "li").
BLOCK_LEVEL_ELEMENTS = re.compile("^(?:p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                  "|script|noscript|form|fieldset|iframe|math"
                                  "|ins|del|hr|hr/|style|li|dt|dd|thead|tbody"
                                  "|tr|th|td)$")
DOC_TAG = "div"     # Element used to wrap document - later removed

# Placeholders
STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
AMP_SUBSTITUTE = STX+"amp"+ETX
82
83
84"""
85Constants you probably do not need to change
86-----------------------------------------------------------------------------
87"""
88
# Unicode ranges, each given as a (first, last) pair of characters.
RTL_BIDI_RANGES = (
    # Hebrew (0590-05FF), Arabic (0600-06FF),
    # Syriac (0700-074F), Arabic supplement (0750-077F),
    # Thaana (0780-07BF), Nko (07C0-07FF).
    (u'\u0590', u'\u07FF'),
    # Tifinagh
    (u'\u2D30', u'\u2D7F'),
)
95
96
97"""
98AUXILIARY GLOBAL FUNCTIONS
99=============================================================================
100"""
101
102
def message(level, text):
    """Report a diagnostic, either through logging or as warning/exception.

    Keyword arguments:

    * level: A logging level (DEBUG, INFO, WARN, ERROR or CRITICAL).
    * text: The message to report.

    If the 'MARKDOWN' logger has handlers attached, the message is logged
    and, for levels above WARN, the process is terminated with a non-zero
    exit status.  Without handlers, levels above WARN raise
    MarkdownException and everything else is issued as a MarkdownWarning.

    """
    logger = logging.getLogger('MARKDOWN')
    if logger.handlers:
        # The logger is configured
        logger.log(level, text)
        if level > WARN:
            # Terminating because of an error must not report success (0).
            sys.exit(1)
    elif level > WARN:
        raise MarkdownException(text)
    else:
        warnings.warn(text, MarkdownWarning)
115
116
def isBlockLevel(tag):
    """Test `tag` against the BLOCK_LEVEL_ELEMENTS pattern.

    Returns whatever ``BLOCK_LEVEL_ELEMENTS.match(tag)`` yields: a match
    object (truthy) on success, otherwise None.

    """
    block_match = BLOCK_LEVEL_ELEMENTS.match(tag)
    return block_match
120
121"""
122MISC AUXILIARY CLASSES
123=============================================================================
124"""
125
class AtomicString(unicode):
    """Marker type: a unicode string that should not be processed further."""
129
130
class MarkdownException(Exception):
    """Exception type used for Markdown errors."""
134
135
class MarkdownWarning(Warning):
    """Warning category used for Markdown warnings."""
139
140
141"""
142OVERALL DESIGN
143=============================================================================
144
145Markdown processing takes place in five steps:
146
1471. A bunch of "preprocessors" munge the input text.
1482. BlockParser() parses the high-level structural elements of the
149   pre-processed text into an ElementTree.
1503. A bunch of "treeprocessors" are run against the ElementTree. One such
151   treeprocessor runs InlinePatterns against the ElementTree, detecting inline
152   markup.
1534. Some post-processors are run against the text after the ElementTree has
154   been serialized into text.
1555. The output is written to a string.
156
157Those steps are put together by the Markdown() class.
158
159"""
160
161import preprocessors
162import blockprocessors
163import treeprocessors
164import inlinepatterns
165import postprocessors
166import blockparser
167import etree_loader
168import odict
169
170# Extensions should use "markdown.etree" instead of "etree" (or do `from
171# markdown import etree`).  Do not import it by yourself.
172
173etree = etree_loader.importETree()
174
175# Adds the ability to output html4
176import html4
177
178
class Markdown:
    """Convert Markdown to HTML.

    An instance wires together the preprocessors, the block parser, the
    tree processors (including the inline patterns) and the postprocessors,
    and runs them in that order from convert().

    """

    def __init__(self,
                 extensions=[],
                 extension_configs={},
                 safe_mode = False,
                 output_format=DEFAULT_OUTPUT_FORMAT):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
           If they are of type string, the module mdx_name.py will be loaded.
           If they are a subclass of markdown.Extension, they will be used
           as-is.
        * extension_configs: Configuration setting for extensions.
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
        * output_format: Format of output. Supported formats are:
            * "xhtml1": Outputs XHTML 1.x. Default.
            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
            * "html4": Outputs HTML 4
            * "html": Outputs latest supported version of HTML (currently HTML 4).
            Note that it is suggested that the more specific formats ("xhtml1"
            and "html4") be used as "xhtml" or "html" may change in the future
            if it makes sense at that time.

        """
        # NOTE: the list/dict defaults above are only read in this class,
        # never mutated, so sharing them between calls is harmless here.

        self.safeMode = safe_mode
        self.registeredExtensions = []
        self.docType = ""
        self.stripTopLevelTags = True

        # Preprocessors
        self.preprocessors = odict.OrderedDict()
        self.preprocessors["html_block"] = \
                preprocessors.HtmlBlockPreprocessor(self)
        self.preprocessors["reference"] = \
                preprocessors.ReferencePreprocessor(self)
        # footnote preprocessor will be inserted with "<reference"

        # Block processors - ran by the parser
        self.parser = blockparser.BlockParser()
        self.parser.blockprocessors['empty'] = \
                blockprocessors.EmptyBlockProcessor(self.parser)
        self.parser.blockprocessors['indent'] = \
                blockprocessors.ListIndentProcessor(self.parser)
        self.parser.blockprocessors['code'] = \
                blockprocessors.CodeBlockProcessor(self.parser)
        self.parser.blockprocessors['hashheader'] = \
                blockprocessors.HashHeaderProcessor(self.parser)
        self.parser.blockprocessors['setextheader'] = \
                blockprocessors.SetextHeaderProcessor(self.parser)
        self.parser.blockprocessors['hr'] = \
                blockprocessors.HRProcessor(self.parser)
        self.parser.blockprocessors['olist'] = \
                blockprocessors.OListProcessor(self.parser)
        self.parser.blockprocessors['ulist'] = \
                blockprocessors.UListProcessor(self.parser)
        self.parser.blockprocessors['quote'] = \
                blockprocessors.BlockQuoteProcessor(self.parser)
        self.parser.blockprocessors['paragraph'] = \
                blockprocessors.ParagraphProcessor(self.parser)

        # Inline patterns - Run on the tree
        # The order of the handlers matters!!!
        self.inlinePatterns = odict.OrderedDict()
        self.inlinePatterns["backtick"] = \
                inlinepatterns.BacktickPattern(inlinepatterns.BACKTICK_RE)
        self.inlinePatterns["escape"] = \
                inlinepatterns.SimpleTextPattern(inlinepatterns.ESCAPE_RE)
        self.inlinePatterns["reference"] = \
            inlinepatterns.ReferencePattern(inlinepatterns.REFERENCE_RE, self)
        self.inlinePatterns["link"] = \
                inlinepatterns.LinkPattern(inlinepatterns.LINK_RE, self)
        self.inlinePatterns["image_link"] = \
                inlinepatterns.ImagePattern(inlinepatterns.IMAGE_LINK_RE, self)
        self.inlinePatterns["image_reference"] = \
            inlinepatterns.ImageReferencePattern(inlinepatterns.IMAGE_REFERENCE_RE, self)
        self.inlinePatterns["autolink"] = \
            inlinepatterns.AutolinkPattern(inlinepatterns.AUTOLINK_RE, self)
        self.inlinePatterns["automail"] = \
            inlinepatterns.AutomailPattern(inlinepatterns.AUTOMAIL_RE, self)
        self.inlinePatterns["linebreak2"] = \
            inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_2_RE, 'br')
        self.inlinePatterns["linebreak"] = \
            inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_RE, 'br')
        self.inlinePatterns["html"] = \
                inlinepatterns.HtmlPattern(inlinepatterns.HTML_RE, self)
        self.inlinePatterns["entity"] = \
                inlinepatterns.HtmlPattern(inlinepatterns.ENTITY_RE, self)
        self.inlinePatterns["not_strong"] = \
                inlinepatterns.SimpleTextPattern(inlinepatterns.NOT_STRONG_RE)
        self.inlinePatterns["strong_em"] = \
            inlinepatterns.DoubleTagPattern(inlinepatterns.STRONG_EM_RE, 'strong,em')
        self.inlinePatterns["strong"] = \
            inlinepatterns.SimpleTagPattern(inlinepatterns.STRONG_RE, 'strong')
        self.inlinePatterns["emphasis"] = \
            inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_RE, 'em')
        self.inlinePatterns["emphasis2"] = \
            inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_2_RE, 'em')

        # Tree processors - run once we have a basic parse.
        self.treeprocessors = odict.OrderedDict()
        self.treeprocessors["inline"] = treeprocessors.InlineProcessor(self)
        self.treeprocessors["prettify"] = \
                treeprocessors.PrettifyTreeprocessor(self)

        # Postprocessors - finishing touches.
        self.postprocessors = odict.OrderedDict()
        self.postprocessors["raw_html"] = \
                postprocessors.RawHtmlPostprocessor(self)
        self.postprocessors["amp_substitute"] = \
                postprocessors.AndSubstitutePostprocessor()
        # footnote postprocessor will be inserted with ">amp_substitute"

        # Map format keys to serializers
        self.output_formats = {
            'html'  : html4.to_html_string,
            'html4' : html4.to_html_string,
            'xhtml' : etree.tostring,
            'xhtml1': etree.tostring,
        }

        self.references = {}
        self.htmlStash = preprocessors.HtmlStash()
        self.registerExtensions(extensions = extensions,
                                configs = extension_configs)
        self.set_output_format(output_format)
        self.reset()

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.  See the docstring on Markdown.
        * configs: A dictionary mapping module names to config options.

        String entries are loaded with load_extension(); an entry that
        fails to load is skipped.  Any other entry that is not an
        Extension instance is reported through message() at ERROR level.

        """
        for ext in extensions:
            if isinstance(ext, basestring):
                ext = load_extension(ext, configs.get(ext, []))
                if ext is None:
                    # load_extension() has already issued a warning for
                    # this name; skip it (as load_extensions() does)
                    # instead of reporting a misleading type error.
                    continue
            if isinstance(ext, Extension):
                try:
                    ext.extendMarkdown(self, globals())
                except NotImplementedError:
                    # sys.exc_info() keeps this valid on every Python
                    # version, unlike "except X, e".
                    message(ERROR, sys.exc_info()[1])
            else:
                message(ERROR, 'Extension "%s.%s" must be of type: "markdown.Extension".' \
                    % (ext.__class__.__module__, ext.__class__.__name__))

    def registerExtension(self, extension):
        """ This gets called by the extension """
        self.registeredExtensions.append(extension)

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.

        Clears the HTML stash and the references, then calls reset() on
        every extension added through registerExtension().
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            extension.reset()

    def set_output_format(self, format):
        """ Set the output format for the class instance.

        `format` is looked up case-insensitively in self.output_formats;
        an unknown key is reported through message() at CRITICAL level.
        """
        try:
            self.serializer = self.output_formats[format.lower()]
        except KeyError:
            message(CRITICAL, 'Invalid Output Format: "%s". Use one of %s.' \
                               % (format, self.output_formats.keys()))

    def convert(self, source):
        """
        Convert markdown to serialized XHTML or HTML.

        Keyword arguments:

        * source: Source text as a Unicode string.

        Returns the converted document as a unicode string (an empty
        string for blank input).

        """

        # Fixup the source text
        if not source.strip():
            return u""  # a blank unicode string
        try:
            source = unicode(source)
        except UnicodeDecodeError:
            message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
            return u""

        # STX/ETX delimit internal placeholders, so they may not appear in
        # the input itself.
        source = source.replace(STX, "").replace(ETX, "")
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        source = re.sub(r'\n\s+\n', '\n\n', source)
        source = source.expandtabs(TAB_LENGTH)

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors.values():
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors.values():
            newRoot = treeprocessor.run(root)
            # Compare with None explicitly: an element without children is
            # falsy, so a plain truth test could discard a returned root.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8"))
        if self.stripTopLevelTags:
            try:
                start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
                end = output.rindex('</%s>'%DOC_TAG)
                output = output[start:end].strip()
            except ValueError:
                if output.strip().endswith('<%s />'%DOC_TAG):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    message(CRITICAL, 'Failed to strip top level tags.')

        # Run the text post-processors
        for pp in self.postprocessors.values():
            output = pp.run(output)

        return output.strip()

    def convertFile(self, input=None, output=None, encoding=None):
        """Converts a markdown file and writes the resulting HTML.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file.  Nothing is returned.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: Name of source text file.
        * output: Name of output file, or a stream with a write() method.
          Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        """

        encoding = encoding or "utf-8"

        # Read the source; always release the handle, even if read() fails.
        input_file = codecs.open(input, mode="r", encoding=encoding)
        try:
            text = input_file.read()
        finally:
            input_file.close()
        text = text.lstrip(u'\ufeff') # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stream
        if isinstance(output, (str, unicode)):
            output_file = codecs.open(output, "w", encoding=encoding)
            try:
                output_file.write(html)
            finally:
                output_file.close()
        else:
            if output is None:
                # Documented default: no output given means stdout.
                output = sys.stdout
            output.write(html.encode(encoding))
458
459
460"""
461Extensions
462-----------------------------------------------------------------------------
463"""
464
class Extension:
    """ Base class for extensions to subclass.

    Settings live in `self.config`, a dict whose values are indexable
    pairs: item 0 is the setting's value, item 1 is what getConfigInfo()
    reports for it.

    """
    def __init__(self, configs = None):
        """Create an instance of an Extension.

        Keyword arguments:

        * configs: A dict of configuration setting used by an Extension.
          When omitted, each instance gets its own empty dict.
        """
        if configs is None:
            # A fresh dict per instance: setConfig() mutates self.config,
            # so a shared default dict would leak between instances.
            configs = {}
        self.config = configs

    def getConfig(self, key):
        """ Return a setting for the given key or an empty string. """
        if key in self.config:
            return self.config[key][0]
        else:
            return ""

    def getConfigInfo(self):
        """ Return all config settings as a list of tuples.

        Each tuple is (key, second item of that key's config entry).
        """
        return [(key, self.config[key][1]) for key in self.config]

    def setConfig(self, key, value):
        """ Set a config setting for `key` with the given `value`.

        Raises KeyError if `key` has no existing entry.
        """
        self.config[key][0] = value

    def extendMarkdown(self, md, md_globals):
        """
        Add the various processors and patterns to the Markdown Instance.

        This method must be overridden by every extension.

        Keyword arguments:

        * md: The Markdown instance.

        * md_globals: Global variables in the markdown module namespace.

        Raises NotImplementedError unless overridden.

        """
        raise NotImplementedError(
            'Extension "%s.%s" must define an "extendMarkdown" method.'
            % (self.__class__.__module__, self.__class__.__name__))
506
507
def load_extension(ext_name, configs = []):
    """Load extension by name, then return the extension it creates.

    The extension name may contain arguments as part of the string in the
    following format: "extname(key1=value1,key2=value2)"

    Keyword arguments:

    * ext_name: Extension name, optionally followed by "(...)" arguments.
    * configs: Config options as (key, value) pairs or a dict.  Arguments
      embedded in `ext_name` override entries with the same key.

    Returns the result of the module's makeExtension(), or None when no
    module could be imported (a warning is issued in that case).

    """

    # Parse extensions config params (ignore the order)
    configs = dict(configs)
    pos = ext_name.find("(") # find the first "("
    if pos > 0:
        ext_args = ext_name[pos+1:]
        ext_name = ext_name[:pos]
        if ext_args.endswith(")"):
            # Only drop a real closing paren, never the last value char.
            ext_args = ext_args[:-1]
        for pair in ext_args.split(","):
            if not pair.strip():
                # Tolerate "name()" and stray commas.
                continue
            # Split on the first "=" only, so values may contain "=".
            key, value = pair.split("=", 1)
            configs[key.strip()] = value.strip()

    # Setup the module names
    ext_module = 'markdown.extensions'
    module_name_new_style = '.'.join([ext_module, ext_name])
    module_name_old_style = '_'.join(['mdx', ext_name])

    # Try loading the extension first from one place, then another
    try: # New style (markdown.extensions.<extension>)
        module = __import__(module_name_new_style, {}, {}, [ext_module])
    except ImportError:
        try: # Old style (mdx_<extension>)
            module = __import__(module_name_old_style)
        except ImportError:
            message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
                % (ext_name, module_name_new_style, module_name_old_style))
            # Return None so we don't try to initiate a non-existent
            # extension
            return None

    # If the module is loaded successfully, we expect it to define a
    # function called makeExtension()
    try:
        return module.makeExtension(configs.items())
    except AttributeError:
        message(CRITICAL, "Failed to initiate extension '%s'" % ext_name)
548
549
def load_extensions(ext_names):
    """Loads multiple extensions.

    Keyword arguments:

    * ext_names: A list whose entries are extension names (strings, may
      contain config args) or already-created extension objects.

    String entries are loaded with load_extension(); names that fail to
    load are left out.  Non-string entries are passed through unchanged
    so that callers may mix names and extension objects.

    Returns a list of extensions in the order given.

    """
    extensions = []
    for ext_name in ext_names:
        if isinstance(ext_name, basestring):
            extension = load_extension(ext_name)
            if not extension:
                continue
        else:
            # Not a name: load_extension() would call str methods on it
            # and fail, so hand the object on as it is.
            extension = ext_name
        extensions.append(extension)
    return extensions
558
559
560"""
561EXPORTED FUNCTIONS
562=============================================================================
563
564Those are the two functions we really mean to export: markdown() and
565markdownFromFile().
566"""
567
def markdown(text,
             extensions = [],
             safe_mode = False,
             output_format = DEFAULT_OUTPUT_FORMAT):
    """Convert a markdown string to HTML and return HTML as a unicode string.

    Convenience wrapper around the `Markdown` class for the simplest use
    case: the extensions are loaded, a `Markdown` instance is built with
    them, and the given text is run through its convert() method.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * extensions: Passed to load_extensions(); names may contain config args.
    * safe_mode: Disallow raw html.  One of "remove", "replace" or "escape".
    * output_format: Format of output. Supported formats are:
        * "xhtml1": Outputs XHTML 1.x. Default.
        * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
        * "html4": Outputs HTML 4
        * "html": Outputs latest supported version of HTML (currently HTML 4).
        Note that it is suggested that the more specific formats ("xhtml1"
        and "html4") be used as "xhtml" or "html" may change in the future
        if it makes sense at that time.

    Returns: An HTML document as a string.

    """
    loaded_extensions = load_extensions(extensions)
    converter = Markdown(extensions=loaded_extensions,
                         safe_mode=safe_mode,
                         output_format=output_format)
    html = converter.convert(text)
    return html
599
600
def markdownFromFile(input = None,
                     output = None,
                     extensions = [],
                     encoding = None,
                     safe_mode = False,
                     output_format = DEFAULT_OUTPUT_FORMAT):
    """Read markdown code from a file and write it to a file or a stream.

    Loads the extensions, builds a `Markdown` instance with them and
    delegates to its convertFile(input, output, encoding).  Returns None.

    """
    loaded_extensions = load_extensions(extensions)
    converter = Markdown(extensions=loaded_extensions,
                         safe_mode=safe_mode,
                         output_format=output_format)
    converter.convertFile(input, output, encoding)
612
613
614
615