"""
INLINE PATTERNS
=============================================================================

Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern.  Pattern objects must be instances of classes
that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:

    pattern.getCompiledRegExp() # returns a regular expression

    pattern.handleMatch(m) # takes a match object and returns
                           # an ElementTree element or just plain text

All of python markdown's built-in patterns subclass from Pattern,
but you can add additional patterns that don't.

Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*?)$".  As a
consequence, group 1 of every match is the text before the pattern and
the pattern's own groups are numbered from 2 upwards.

Finally, the order in which regular expressions are applied is very
important - e.g. if we first replace http://.../ links with <a> tags
and _then_ try to replace inline html, we would end up with a mess.
So, we apply the expressions in the following order:

* escape and backticks have to go before everything else, so
  that we can preempt any markdown patterns by escaping them.

* then we handle auto-links (must be done before inline html)

* then we handle inline HTML.  At this point we will simply
  replace all inline HTML strings with a placeholder and add
  the actual HTML to a hash.

* then inline images (must be done before links)

* then bracketed links, first regular then reference-style

* finally we apply strong and emphasis
"""

import markdown
import re
import sys

# The module already special-cases Python 3 for ``htmlentitydefs``; do the
# same for ``urlparse``, which lives in ``urllib.parse`` on Python 3.
try:
    from urllib.parse import urlparse, urlunparse
except ImportError:
    from urlparse import urlparse, urlunparse

# Compare the version tuple, not the version string.
if sys.version_info >= (3, 0):
    from html import entities as htmlentitydefs
else:
    import htmlentitydefs


# The actual regular expressions for patterns
# -----------------------------------------------------------------------------

NOBRACKET = r'[^\]\[]*'
# Bracketed text allowing up to six levels of nested brackets.
BRK = ( r'\[('
        + (NOBRACKET + r'(\[')*6
        + (NOBRACKET+ r'\])*')*6
        + NOBRACKET + r')\]' )
NOIMG = r'(?<!\!)'

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                             # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***

if markdown.SMART_EMPHASIS:
    EMPHASIS_2_RE = r'(?<!\w)(_)(\S.+?)\2(?!\w)'        # _emphasis_
else:
    EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_

LINK_RE = NOIMG + BRK + \
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12)?\)'''
# [text](url) or [text](<url>)

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]'           # [Google][3]
# Raw string: the pattern contains regex escapes (\s, \[, \]).
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s*\[([^\]]*)\]' # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>'        # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
# A hard line break is written as TWO trailing spaces; both patterns below
# must therefore contain exactly two space characters.
LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
LINE_BREAK_2_RE = r'  $'                    # two spaces at end of text


def dequote(string):
    """Remove quotes from around a string.

    If `string` both starts and ends with a double quote, or both starts
    and ends with a single quote, the first and last characters are
    dropped.  Otherwise `string` is returned unchanged.

    """
    if ( (string.startswith('"') and string.endswith('"'))
         or (string.startswith("'") and string.endswith("'")) ):
        return string[1:-1]
    else:
        return string

ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}

def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Every `{@name=value}` occurrence in `text` is applied to `parent` via
    `parent.set(name, value)` (newlines in the value become spaces) and is
    removed from the text.

    Keyword arguments:

    * text: The string to scan for attribute definitions.
    * parent: The element that receives the attributes.

    Returns `text` with all attribute definitions stripped out.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Explicitly replace the definition with nothing.
        return ''
    return ATTR_RE.sub(attributeCallback, text)


# The pattern classes
# -----------------------------------------------------------------------------

class Pattern:
    """Base class that inline patterns subclass. """

    def __init__ (self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional object stored as `self.markdown`
          when given; some subclasses read `self.markdown`.

        """
        self.pattern = pattern
        # Wrap the pattern so a match always spans the whole text: group 1
        # is everything before the pattern, the last group everything after.
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp (self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

BasePattern = Pattern # for backward compatibility

class SimpleTextPattern (Pattern):
    """ Return a simple text of group(2) of a Pattern. """
    def handleMatch(self, m):
        """Return group(2) of `m`, or None if it equals
        `markdown.INLINE_PLACEHOLDER_PREFIX`."""
        text = m.group(2)
        if text == markdown.INLINE_PLACEHOLDER_PREFIX:
            return None
        return text

class SimpleTagPattern (Pattern):
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """
    def __init__ (self, pattern, tag):
        """Store `tag` (the element name) in addition to the pattern."""
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """Return a new `self.tag` element whose text is group(3) of `m`."""
        el = markdown.etree.Element(self.tag)
        el.text = m.group(3)
        return el


class SubstituteTagPattern (SimpleTagPattern):
    """ Return an element of type `tag` with no children. """
    def handleMatch (self, m):
        """Return an empty `self.tag` element; the match text is ignored."""
        return markdown.etree.Element(self.tag)


class BacktickPattern (Pattern):
    """ Return a `<code>` element containing the matching text. """
    def __init__ (self, pattern):
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        """Return a `code` element whose text is the stripped group(3) of
        `m`, wrapped in `markdown.AtomicString`."""
        el = markdown.etree.Element(self.tag)
        el.text = markdown.AtomicString(m.group(3).strip())
        return el


class DoubleTagPattern (SimpleTagPattern):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        """Return a `tag1` element containing a `tag2` child whose text is
        group(3) of `m`; `self.tag` must have the form "tag1,tag2"."""
        tag1, tag2 = self.tag.split(",")
        el1 = markdown.etree.Element(tag1)
        el2 = markdown.etree.SubElement(el1, tag2)
        el2.text = m.group(3)
        return el1


class HtmlPattern (Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch (self, m):
        """Store group(2) of `m` in `self.markdown.htmlStash` and return
        whatever `store()` returns as the placeholder."""
        rawhtml = m.group(2)
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder


class LinkPattern (Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        """Build an `a` element from a LINK_RE style match.

        Group 2 is the link text, group 9 the href (optionally wrapped in
        angle brackets) and group 11 the optional quoted title.

        """
        el = markdown.etree.Element("a")
        el.text = m.group(2)
        title = m.group(11)
        href = m.group(9)

        if href:
            # Only unwrap a properly bracketed "<url>"; a href that merely
            # starts with "<" must not lose its last character.  This
            # matches the check ImagePattern performs.
            if href[0] == "<" and href[-1] == ">":
                href = href[1:-1]
            el.set("href", self.sanitize_url(href.strip()))
        else:
            el.set("href", "")

        if title:
            title = dequote(title)
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Returns the url re-assembled by `urlunparse()`, or an empty string
        when `self.markdown.safeMode` is on and the url is not considered
        safe.  A url that `urlparse()` rejects with `ValueError` is treated
        as unsafe: it yields '' in safe mode and is passed through
        unchanged otherwise.

        """
        locless_schemes = ['', 'mailto', 'news']
        try:
            parts = urlparse(url)
        except ValueError:
            # Too malformed to parse; do not let it abort the conversion.
            if self.markdown.safeMode:
                return ''
            return url

        scheme, netloc = parts[0], parts[1]
        safe_url = netloc != '' or scheme in locless_schemes

        # path, params, query and fragment must not contain a colon.
        for part in parts[2:]:
            if ":" in part:
                safe_url = False
                break

        if self.markdown.safeMode and not safe_url:
            return ''
        else:
            return urlunparse(parts)

class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """
    def handleMatch(self, m):
        """Build an `img` element from an IMAGE_LINK_RE style match.

        Group 9 is split on whitespace: the first piece is the source
        (optionally wrapped in angle brackets) and any remaining pieces,
        re-joined with single spaces and dequoted, form the title.  Group 2
        is the alt text; when `markdown.ENABLE_ATTRIBUTES` is set,
        `{@name=value}` definitions are moved from it onto the element.

        """
        el = markdown.etree.Element("img")
        src_parts = m.group(9).split()
        if src_parts:
            src = src_parts[0]
            if src[0] == "<" and src[-1] == ">":
                src = src[1:-1]
            el.set('src', self.sanitize_url(src))
        else:
            el.set('src', "")
        if len(src_parts) > 1:
            el.set('title', dequote(" ".join(src_parts[1:])))

        if markdown.ENABLE_ATTRIBUTES:
            truealt = handleAttributes(m.group(2), el)
        else:
            truealt = m.group(2)

        el.set('alt', truealt)
        return el

class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """
    def handleMatch(self, m):
        """Look the reference up in `self.markdown.references` and build
        the tag; return None when the reference id is not defined."""
        if m.group(9):
            ref_id = m.group(9).lower()
        else:
            # if we got something like "[Google][]"
            # we'll use "google" as the id
            ref_id = m.group(2).lower()

        if ref_id not in self.markdown.references: # ignore undefined refs
            return None
        href, title = self.markdown.references[ref_id]

        text = m.group(2)
        return self.makeTag(href, title, text)

    def makeTag(self, href, title, text):
        """Return an `a` element with a sanitized href, an optional title
        and `text` as its text."""
        el = markdown.etree.Element('a')

        el.set('href', self.sanitize_url(href))
        if title:
            el.set('title', title)

        el.text = text
        return el


class ImageReferencePattern (ReferencePattern):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """Return an `img` element with a sanitized src, an optional title
        and `text` as its alt attribute."""
        el = markdown.etree.Element("img")
        el.set("src", self.sanitize_url(href))
        if title:
            el.set("title", title)
        el.set("alt", text)
        return el


class AutolinkPattern (Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        """Return an `a` element whose href and (atomic) text are both
        group(2) of `m`."""
        el = markdown.etree.Element("a")
        el.set('href', m.group(2))
        el.text = markdown.AtomicString(m.group(2))
        return el

class AutomailPattern (Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        """Return an `a` element for the address in group(2) of `m`.

        A leading "mailto:" is dropped from the address.  Every character
        of the link text is written as a named entity when one exists for
        its code point and as a numeric entity otherwise; every character
        of the "mailto:..." href is written as a numeric entity.  All
        entities start with `markdown.AMP_SUBSTITUTE` rather than a
        literal ampersand.

        """
        el = markdown.etree.Element('a')
        email = m.group(2)
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = htmlentitydefs.codepoint2name.get(code)
            if entity:
                return "%s%s;" % (markdown.AMP_SUBSTITUTE, entity)
            else:
                return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code)

        letters = [codepoint2name(ord(letter)) for letter in email]
        el.text = markdown.AtomicString(''.join(letters))

        mailto = "mailto:" + email
        mailto = "".join([markdown.AMP_SUBSTITUTE + '#%d;' %
                          ord(letter) for letter in mailto])
        el.set('href', mailto)
        return el