# markdown is released under the BSD license
# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the <organization> nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


"""
INLINE PATTERNS
=============================================================================

Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern.  Pattern objects must be instances of classes
that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:

    pattern.getCompiledRegExp() # returns a regular expression

    pattern.handleMatch(m) # takes a match object and returns
                           # an ElementTree element or just plain text

All of python markdown's built-in patterns subclass from Pattern,
but you can add additional patterns that don't.

Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*?)$'.  For the built-in expressions, Pattern
takes care of adding the "^(.*?)" and "(.*?)$", which is why the groups
of a pattern's own expression are numbered from 2 in handleMatch().

Finally, the order in which regular expressions are registered is very
important - e.g. if we first replace http://.../ links with <a> tags
and _then_ try to replace inline html, we would end up with a mess.
build_inlinepatterns() therefore registers the expressions in the
following order:

* backticks and escapes go before everything else, so
  that we can preempt any markdown patterns by escaping them.

* then bracketed links and images: reference links, inline links,
  inline images, image references and short references.  The link
  expressions carry a negative look-behind for "!" so that they do
  not consume image syntax.

* then auto-links and auto-mail (must be done before inline html)

* then line breaks, inline HTML and entities.  Inline HTML strings are
  replaced with a placeholder and the actual HTML is stored in a stash.

* finally we apply strong and emphasis
"""

from __future__ import absolute_import
from __future__ import unicode_literals
from . import util
from . import odict
import re
try:
    from urllib.parse import urlparse, urlunparse
except ImportError:
    from urlparse import urlparse, urlunparse
try:
    from html import entities
except ImportError:
    import htmlentitydefs as entities


def build_inlinepatterns(md_instance, **kwargs):
    """ Build the default set of inline patterns for Markdown.

    Keyword arguments:

    * md_instance: The Markdown instance the patterns are built for.  Its
      `safeMode` and `smart_emphasis` attributes select which patterns
      are registered.
    * kwargs: Accepted and ignored.

    Returns an ordered dict mapping pattern names to pattern objects, in
    registration order.

    """
    entries = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md_instance)),
        ("reference", ReferencePattern(REFERENCE_RE, md_instance)),
        ("link", LinkPattern(LINK_RE, md_instance)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md_instance)),
        ("image_reference",
         ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md_instance)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md_instance)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md_instance)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ]

    # Raw inline html is only registered when it is not being escaped.
    if md_instance.safeMode != 'escape':
        entries.append(("html", HtmlPattern(HTML_RE, md_instance)))

    entries.extend([
        ("entity", HtmlPattern(ENTITY_RE, md_instance)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'strong,em')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ])

    # Underscore emphasis comes in a "smart" and a plain flavour.
    if md_instance.smart_emphasis:
        underscore_re = SMART_EMPHASIS_RE
    else:
        underscore_re = EMPHASIS_2_RE
    entries.append(("emphasis2", SimpleTagPattern(underscore_re, 'em')))

    registry = odict.OrderedDict()
    for name, pattern in entries:
        registry[name] = pattern
    return registry

"""
The actual regular expressions for patterns
-----------------------------------------------------------------------------
"""

# Any run of characters that are not square brackets.
NOBRACKET = r'[^\]\[]*'
# Bracketed text, allowing up to six levels of nested square brackets.
BRK = (
    r'\[('
    + (NOBRACKET + r'(\[') * 6
    + (NOBRACKET + r'\])*') * 6
    + NOBRACKET + r')\]'
)
# Negative look-behind: not preceded by "!" (i.e. not image syntax).
NOIMG = r'(?<!\!)'

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'  # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'  # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'  # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'  # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'  # ***strong***
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'  # _emphasis_
LINK_RE = NOIMG + BRK + \
    r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
# [text](url) or [text](<url>) or [text](url "title")

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'  # [Google][3]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'  # [Google]
# Raw string: '\s' and '\[' are not valid Python string escapes.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'  # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'  # stand-alone * or _
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'  # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'  # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'  # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'  # &amp;
# Exactly two spaces followed by a newline.  Written with an explicit
# repeat count so the number of spaces cannot be silently lost.
LINE_BREAK_RE = r' {2}\n'  # two spaces at end of line


def dequote(string):
    """Remove quotes from around a string.

    If `string` both starts and ends with a double quote, or both starts
    and ends with a single quote, the first and last characters are
    dropped.  Any other string is returned unchanged.

    """
    if ((string.startswith('"') and string.endswith('"'))
            or (string.startswith("'") and string.endswith("'"))):
        return string[1:-1]
    else:
        return string

# Raw string: '\{' and '\}' are not valid Python string escapes.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}")  # {@id=123}


def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Keyword arguments:

    * text: The text to scan for `{@name=value}` definitions.
    * parent: The element on which each `name` is set to `value` (with
      newlines in the value replaced by spaces).

    Returns `text` after substitution.  The callback returns None, which
    `re.sub` treats as an empty replacement, so the attribute definitions
    are removed from the returned text.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
    return ATTR_RE.sub(attributeCallback, text)


"""
The pattern classes
-----------------------------------------------------------------------------
"""


class Pattern(object):
    """Base class that inline patterns subclass.

    The given expression is wrapped as "^(.*?)<pattern>(.*?)$", so in a
    match object group 1 is the text before the match and the groups of
    the pattern itself are numbered from 2.

    """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional Markdown instance.  When given (and
          truthy) it is stored as `self.markdown`; otherwise the
          attribute is not set at all.

        """
        self.pattern = pattern
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.  The base implementation
        does nothing and returns None.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each inline placeholder found in `text` whose id is in the inline
        treeprocessor's stash is replaced by the stashed value: a stashed
        string is used as-is, a stashed element is replaced by its text
        content.  If no 'inline' treeprocessor is registered, `text` is
        returned unchanged.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def itertext(el):
            ' Reimplement Element.itertext for older python versions '
            tag = el.tag
            # Skip nodes whose tag is neither a string nor None.
            if not isinstance(tag, util.string_type) and tag is not None:
                return
            if el.text:
                yield el.text
            for e in el:
                for s in itertext(e):
                    yield s
                if e.tail:
                    yield e.tail

        def get_stash(m):
            placeholder_id = m.group(1)
            if placeholder_id in stash:
                value = stash.get(placeholder_id)
                if isinstance(value, util.string_type):
                    return value
                else:
                    # An etree Element - return text content only
                    return ''.join(itertext(value))
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class SimpleTextPattern(Pattern):
    """ Return a simple text of group(2) of a Pattern. """
    def handleMatch(self, m):
        """ Return group(2), or None if it is the placeholder prefix. """
        text = m.group(2)
        if text == util.INLINE_PLACEHOLDER_PREFIX:
            return None
        return text


class EscapePattern(Pattern):
    """ Return an escaped character. """

    def handleMatch(self, m):
        """ Return the escaped character wrapped in STX/ETX markers.

        A character listed in the Markdown instance's ESCAPED_CHARS is
        returned as STX + its code point + ETX.  Any other character is
        returned unchanged, still preceded by its backslash.

        """
        char = m.group(2)
        if char in self.markdown.ESCAPED_CHARS:
            return '%s%s%s' % (util.STX, ord(char), util.ETX)
        else:
            return '\\%s' % char


class SimpleTagPattern(Pattern):
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        """ Store `tag`; no Markdown instance is passed to Pattern. """
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """ Return a `tag` element whose text is group(3). """
        el = util.etree.Element(self.tag)
        el.text = m.group(3)
        return el


class SubstituteTagPattern(SimpleTagPattern):
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m):
        """ Return an empty `tag` element; the match is not used. """
        return util.etree.Element(self.tag)


class BacktickPattern(Pattern):
    """ Return a `<code>` element containing the matching text. """
    def __init__(self, pattern):
        """ Create the pattern; the tag is always "code". """
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        """ Return a code element holding stripped group(3) as atomic text. """
        el = util.etree.Element(self.tag)
        el.text = util.AtomicString(m.group(3).strip())
        return el


class DoubleTagPattern(SimpleTagPattern):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.  `tag` is given as "tag1,tag2".

    """
    def handleMatch(self, m):
        """ Return a tag1 element containing a tag2 element with group(3). """
        tag1, tag2 = self.tag.split(",")
        el1 = util.etree.Element(tag1)
        el2 = util.etree.SubElement(el1, tag2)
        el2.text = m.group(3)
        return el1


class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m):
        """ Stash the unescaped raw html and return its placeholder. """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Unlike Pattern.unescape, stashed values are passed through the
        Markdown instance's serializer.  If serializing fails, the value
        is formatted as a string preceded by a backslash instead.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def get_stash(m):
            placeholder_id = m.group(1)
            value = stash.get(placeholder_id)
            if value is not None:
                try:
                    return self.markdown.serializer(value)
                except Exception:
                    # Best-effort fallback for values the serializer
                    # rejects.  `Exception` rather than a bare except so
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    return '\\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class LinkPattern(Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        """ Return an `a` element built from an inline link match.

        With LINK_RE wrapped by Pattern, group(2) is the link text,
        group(9) the destination and group(13) the optional title.

        """
        el = util.etree.Element("a")
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            # Only strip angle brackets when both are present, so that a
            # destination merely starting with "<" does not lose its last
            # character (same check as ImagePattern).
            if href[0] == "<" and href[-1] == ">":
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        if title:
            title = dequote(self.unescape(title))
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Spaces are always replaced by "%20".  Outside safe mode the url is
        otherwise returned untouched; in safe mode a url that fails any
        check is replaced by an empty string.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Return immediately bypassing parsing.
            return url

        try:
            parsed = urlparse(url)
        except ValueError:
            # Bad url - so bad it couldn't be parsed.
            return ''
        scheme, netloc = parsed[0], parsed[1]

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:
            # This should not happen. Treat as suspect.
            return ''

        for part in parsed[2:]:
            if ":" in part:
                # A colon in "path", "parameters", "query" or "fragment"
                # is suspect.
                return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(parsed)


class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """
    def handleMatch(self, m):
        """ Return an `img` element built from an inline image match.

        group(9) holds everything between the parentheses: its first
        whitespace-separated part is the source, any remaining parts are
        joined with single spaces and dequoted to form the title.
        group(2) is the alt text.

        """
        el = util.etree.Element("img")
        src_parts = m.group(9).split()
        if src_parts:
            src = src_parts[0]
            if src[0] == "<" and src[-1] == ">":
                src = src[1:-1]
            el.set('src', self.sanitize_url(self.unescape(src)))
        else:
            el.set('src', "")
        if len(src_parts) > 1:
            el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))

        # Attribute definitions in the alt text are applied to the element
        # only when the Markdown instance enables them.
        if self.markdown.enable_attributes:
            truealt = handleAttributes(m.group(2), el)
        else:
            truealt = m.group(2)

        el.set('alt', self.unescape(truealt))
        return el


class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """ Return the element for a reference match, or None if undefined.

        The reference id is group(9) lowercased; when that group does not
        exist or is empty the link text, group(2), is used instead.

        """
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            # SHORT_REF_RE has a single group, so there is no group 9.
            ref_id = None
        if not ref_id:
            # if we got something like "[Google][]" or "[Goggle]"
            # we'll use "google" as the id
            ref_id = m.group(2).lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.markdown.references:  # ignore undefined refs
            return None
        href, title = self.markdown.references[ref_id]

        text = m.group(2)
        return self.makeTag(href, title, text)

    def makeTag(self, href, title, text):
        """ Return an `a` element for the resolved reference.

        `href` is passed through sanitize_url; `title` is set only when
        truthy; `text` becomes the element text.

        """
        el = util.etree.Element('a')

        el.set('href', self.sanitize_url(href))
        if title:
            el.set('title', title)

        el.text = text
        return el


class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """ Return an `img` element for the resolved reference.

        `href` is sanitized into `src`, `title` is set only when truthy,
        and `text` (after optional attribute handling and unescaping)
        becomes `alt`.

        """
        el = util.etree.Element("img")
        el.set("src", self.sanitize_url(href))
        if title:
            el.set("title", title)

        if self.markdown.enable_attributes:
            text = handleAttributes(text, el)

        el.set("alt", self.unescape(text))
        return el


class AutolinkPattern(Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        """ Return an `a` element whose href and atomic text are group(2). """
        el = util.etree.Element("a")
        el.set('href', self.unescape(m.group(2)))
        el.text = util.AtomicString(m.group(2))
        return el


class AutomailPattern(Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        """ Return an `a` element with an entity-encoded mailto link.

        A leading "mailto:" in the matched address is dropped.  Every
        character of the link text is written as a named entity where one
        is defined and as a numeric entity otherwise; every character of
        the href (including the "mailto:" prefix) is written as a numeric
        entity.  All entities start with util.AMP_SUBSTITUTE.

        """
        el = util.etree.Element('a')
        email = self.unescape(m.group(2))
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if entity:
                return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
            else:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        letters = [codepoint2name(ord(letter)) for letter in email]
        el.text = util.AtomicString(''.join(letters))

        mailto = "mailto:" + email
        mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
                          ord(letter) for letter in mailto])
        el.set('href', mailto)
        return el