"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).

INLINE PATTERNS
=============================================================================

Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern.  Pattern objects must be instances of classes
that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:

    pattern.getCompiledRegExp() # returns a regular expression

    pattern.handleMatch(m) # takes a match object and returns
                           # an ElementTree element or just plain text

All of python markdown's built-in patterns subclass from Pattern,
but you can add additional patterns that don't.

Also note that all the regular expressions used by the legacy `Pattern`
classes must capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*)$'.  For the built-in expressions, `Pattern`
takes care of adding the "^(.*?)" and "(.*)$".  The newer `InlineProcessor`
classes compile their expression as given, without that wrapper.

Finally, the order in which regular expressions are applied is very
important - e.g. if we first replace http://.../ links with <a> tags
and _then_ try to replace inline html, we would end up with a mess.
So, we apply the expressions in the following order:

* escape and backticks have to go before everything else, so
  that we can preempt any markdown patterns by escaping them.

* then we handle auto-links (must be done before inline html)

* then we handle inline HTML.  At this point we will simply
  replace all inline HTML strings with a placeholder and add
  the actual HTML to a hash.

* then inline images (must be done before links)

* then bracketed links, first regular then reference-style

* finally we apply strong and emphasis
"""

from . import util
from collections import namedtuple
import re
import xml.etree.ElementTree as etree
try:  # pragma: no cover
    from html import entities
except ImportError:  # pragma: no cover
    import htmlentitydefs as entities


def build_inlinepatterns(md, **kwargs):
    """ Build the default set of inline patterns for Markdown.

    Keyword arguments:

    * md: The Markdown instance handed to the processors that need it.
    * kwargs: Accepted but not used by this function.

    Returns a `util.Registry` holding every built-in inline processor,
    each registered under a name and a numeric priority.
    """
    inlinePatterns = util.Registry()
    inlinePatterns.register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190)
    inlinePatterns.register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180)
    inlinePatterns.register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170)
    inlinePatterns.register(LinkInlineProcessor(LINK_RE, md), 'link', 160)
    inlinePatterns.register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150)
    inlinePatterns.register(
        ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140
    )
    inlinePatterns.register(
        ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130
    )
    inlinePatterns.register(
        ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125
    )
    inlinePatterns.register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120)
    inlinePatterns.register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110)
    inlinePatterns.register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100)
    inlinePatterns.register(HtmlInlineProcessor(HTML_RE, md), 'html', 90)
    inlinePatterns.register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80)
    inlinePatterns.register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70)
    inlinePatterns.register(AsteriskProcessor(r'\*'), 'em_strong', 60)
    inlinePatterns.register(UnderscoreProcessor(r'_'), 'em_strong2', 50)
    return inlinePatterns


"""
The actual regular expressions for patterns
-----------------------------------------------------------------------------
"""

# Negative lookbehind: the following `[` must not be preceded by `!`
# (which would make it an image rather than a link).
NOIMG = r'(?<!\!)'

# `e=f()` or ``e=f("`")``
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'

# \<
ESCAPE_RE = r'\\(.)'

# *emphasis*
EMPHASIS_RE = r'(\*)([^\*]+)\1'

# **strong**
STRONG_RE = r'(\*{2})(.+?)\1'

# __smart__strong__
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'

# _smart_emphasis_
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'

# __strong _em__
SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'

# ***strongem*** or ***em*strong**
EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'

# ___strongem___ or ___em_strong__
EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'

# ***strong**em*
STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'

# ___strong__em_
STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'

# **strong*em***
STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'

# [text](url) or [text](<url>) or [text](url "title")
LINK_RE = NOIMG + r'\['

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
IMAGE_LINK_RE = r'\!\['

# [Google][3]
REFERENCE_RE = LINK_RE

# ![alt text][2]
IMAGE_REFERENCE_RE = IMAGE_LINK_RE

# stand-alone * or _
NOT_STRONG_RE = r'((^|\s)(\*|_)(\s|$))'

# <http://www.123.com>
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'

# <me@example.com>
AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'

# <...>
HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'

# "&#38;" (decimal) or "&#x26;" (hex) or "&amp;" (named)
ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'

# two spaces at end of line
# (a single trailing space must NOT be turned into a hard line break)
LINE_BREAK_RE = r'  \n'


def dequote(string):
    """Remove quotes from around a string.

    A matching pair of single or double quotes is stripped from the ends of
    `string`; anything else is returned unchanged.  At least two characters
    are required, so a lone quote character is not mistaken for an (empty)
    quoted string.
    """
    if len(string) >= 2 and string[0] == string[-1] and string[0] in ('"', "'"):
        return string[1:-1]
    return string


class EmStrongItem(namedtuple('EmStrongItem', ['pattern', 'builder', 'tags'])):
    """Emphasis/strong pattern item.

    * pattern: compiled regular expression to try.
    * builder: name of the builder to use ('single', 'double' or 'double2').
    * tags: tag name, or two comma separated tag names for the double builders.
    """


"""
The pattern classes
-----------------------------------------------------------------------------
"""


class Pattern:  # pragma: no cover
    """Base class that inline patterns subclass.

    This is the legacy style: the expression is wrapped so that it captures
    the whole block, which shifts the pattern's own groups by one.
    """

    ANCESTOR_EXCLUDES = tuple()

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: The Markdown instance (optional); needed by `unescape`.

        """
        self.pattern = pattern
        # Group 1 is the text before the match, the last group the text after.
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each inline placeholder found in `text` is looked up in the stash of
        the 'inline' tree processor.  Stashed strings are substituted as-is;
        stashed elements are replaced by their text content only.  If no
        'inline' tree processor is registered, `text` is returned unchanged.
        """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            node_id = m.group(1)
            if node_id in stash:
                value = stash.get(node_id)
                if isinstance(value, str):
                    return value
                else:
                    # An etree Element - return text content only
                    return ''.join(value.itertext())
            # NOTE(review): falls through (returns None) for ids that are not
            # stashed - confirm that is the intended handling.
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class InlineProcessor(Pattern):
    """
    Base class that inline patterns subclass.

    This is the newer style inline processor that uses a more
    efficient and flexible search approach.
    """

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: The Markdown instance (optional); needed by `unescape`.

        """
        self.pattern = pattern
        # Unlike `Pattern`, the expression is compiled exactly as given.
        self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        self.md = md

    def handleMatch(self, m, data):
        """Return a ElementTree element from the given match and the
        start and end index of the matched text.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.
        * data: The buffer current under analysis

        Returns:

        * el: The ElementTree element, text or None.
        * start: The start of the region that has been matched or None.
        * end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover


class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Return a simple text of group(2) of a Pattern. """
    def handleMatch(self, m):
        return m.group(2)


class SimpleTextInlineProcessor(InlineProcessor):
    """ Return a simple text of group(1) of a Pattern. """
    def handleMatch(self, m, data):
        return m.group(1), m.start(0), m.end(0)


class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m, data):
        """Replace a backslash escape with an STX/ETX wrapped code point.

        Characters not listed in `md.ESCAPED_CHARS` yield `None` as the
        replacement, together with the span of the match.
        """
        char = m.group(1)
        if char in self.md.ESCAPED_CHARS:
            return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0)
        else:
            return None, m.start(0), m.end(0)


class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        el = etree.Element(self.tag)
        el.text = m.group(3)
        return el


class SimpleTagInlineProcessor(InlineProcessor):
    """
    Return element of type `tag` with a text attribute of group(2)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        InlineProcessor.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m, data):  # pragma: no cover
        el = etree.Element(self.tag)
        el.text = m.group(2)
        return el, m.start(0), m.end(0)


class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m):
        return etree.Element(self.tag)


class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m, data):
        return etree.Element(self.tag), m.start(0), m.end(0)


class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the matching text. """
    def __init__(self, pattern):
        InlineProcessor.__init__(self, pattern)
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'

    def handleMatch(self, m, data):
        """Build the `<code>` element, or escape backslashes before backticks.

        When group(3) (the code text) matched, a code element holding the
        stripped, code-escaped text is returned.  Otherwise the match is a run
        of escaped backslashes (group(1)); each `\\\\` pair is replaced by the
        escaped-backslash placeholder.
        """
        if m.group(3):
            el = etree.Element(self.tag)
            el.text = util.AtomicString(util.code_escape(m.group(3).strip()))
            return el, m.start(0), m.end(0)
        else:
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0)


class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        tag1, tag2 = self.tag.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.SubElement(el1, tag2)
        el2.text = m.group(3)
        if len(m.groups()) == 5:
            el2.tail = m.group(4)
        return el1


class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m, data):  # pragma: no cover
        tag1, tag2 = self.tag.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.SubElement(el1, tag2)
        el2.text = m.group(2)
        if len(m.groups()) == 3:
            el2.tail = m.group(3)
        return el1, m.start(0), m.end(0)


class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m, data):
        rawhtml = self.unescape(m.group(1))
        place_holder = self.md.htmlStash.store(rawhtml)
        return place_holder, m.start(0), m.end(0)

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Unlike the base implementation, stashed values are passed through
        `md.serializer`; if serialising fails, the value is substituted
        prefixed with a backslash.
        """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            node_id = m.group(1)
            value = stash.get(node_id)
            if value is not None:
                try:
                    return self.md.serializer(value)
                except Exception:
                    return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    # Tried in order; the position in this list is also the `idx` used to
    # restrict which patterns may be applied to nested content.
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]

    def build_single(self, m, tag, idx):
        """Return single tag."""
        el1 = etree.Element(tag)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        return el1

    def build_double(self, m, tags, idx):
        """Return double tag."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el2, None, idx)
        el1.append(el2)
        if len(m.groups()) == 3:
            # Trailing text belongs to the outer element, after the inner one.
            text = m.group(3)
            self.parse_sub_patterns(text, el1, el2, idx)
        return el1

    def build_double2(self, m, tags, idx):
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        text = m.group(3)
        el1.append(el2)
        self.parse_sub_patterns(text, el2, None, idx)
        return el1

    def parse_sub_patterns(self, data, parent, last, idx):
        """
        Parses sub patterns.

        `data` (`str`):
            text to evaluate.

        `parent` (`etree.Element`):
            Parent to attach text and sub elements to.

        `last` (`etree.Element`):
            Last appended child to parent. Can also be None if parent has no children.

        `idx` (`int`):
            Current pattern index that was used to evaluate the parent.

        """

        offset = 0
        pos = 0

        length = len(data)
        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if self.compiled_re.match(data, pos):
                matched = False
                # See if the we can match an emphasis/strong pattern
                for index, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if index <= idx:
                        continue
                    m = item.pattern.match(data, pos)
                    if m:
                        # Append child nodes to parent
                        # Text nodes should be appended to the last
                        # child if present, and if not, it should
                        # be added as the parent's text node.
                        text = data[offset:m.start(0)]
                        if text:
                            if last is not None:
                                last.tail = text
                            else:
                                parent.text = text
                        el = self.build_element(m, item.builder, item.tags, index)
                        parent.append(el)
                        last = el
                        # Move our position past the matched hunk
                        offset = pos = m.end(0)
                        matched = True
                if not matched:
                    # We matched nothing, move on to the next character
                    pos += 1
            else:
                # Increment position as no potential emphasis start was found.
                pos += 1

        # Append any leftover text as a text node.
        text = data[offset:]
        if text:
            if last is not None:
                last.tail = text
            else:
                parent.text = text

    def build_element(self, m, builder, tags, index):
        """Element builder.

        Dispatch on the `builder` name of an `EmStrongItem`; any name other
        than 'double2' or 'double' is handled by `build_single`.
        """

        if builder == 'double2':
            return self.build_double2(m, tags, index)
        elif builder == 'double':
            return self.build_double(m, tags, index)
        else:
            return self.build_single(m, tags, index)

    def handleMatch(self, m, data):
        """Parse patterns.

        Try each entry of `PATTERNS` at the position where the token was
        found and build the element for the first one that matches.  Returns
        `(None, None, None)` when none of them match.
        """

        el = None
        start = None
        end = None

        for index, item in enumerate(self.PATTERNS):
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start = m1.start(0)
                end = m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                break
        return el, start, end


class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]


class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m, data):
        """Build an `<a>` element for `[text](href "title")`.

        Returns `(None, None, None)` if either the bracketed text or the
        parenthesised link part cannot be resolved.
        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data, index):
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Keyword arguments:

        * data: The buffer under analysis.
        * index: Position in `data` where the opening `(` is expected.

        Returns a tuple `(href, title, index, handled)`: the link target, the
        title (or `None`), the position just past the link and whether a
        complete link was found.
        """

        href = ''
        title = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = data[start_quote:exit_quote - 1]
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = data[start_alt_quote:exit_alt_quote - 1]
                    else:
                        href = data[start_index:index - 1]
                    break

                if c != ' ':
                    last = c

            # We have a scenario: [test](link"notitle)
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data, index):
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Returns a tuple `(text, index, handled)`: the collected text, the
        position just past the last character consumed and whether the
        closing `]` was found.
        """
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0


class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a img element from the given match. """

    def handleMatch(self, m, data):
        """Build an `<img>` element for `![alt](src "title")`.

        Returns `(None, None, None)` if either the bracketed text or the
        parenthesised link part cannot be resolved.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        src, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("img")

        el.set("src", src)

        if title is not None:
            el.set("title", title)

        el.set('alt', self.unescape(text))
        return el, m.start(0), index


class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m, data):
        """Resolve `[text][id]` against `md.references`.

        Returns `(None, None, None)` when the text or id cannot be parsed,
        and `(None, start, end)` when the id is not a defined reference.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.md.references:  # ignore undefined refs
            return None, m.start(0), end

        href, title = self.md.references[ref_id]

        return self.makeTag(href, title, text), m.start(0), end

    def evalId(self, data, index, text):
        """
        Evaluate the id portion of [ref][id].

        If [ref][] use [ref].

        Returns a tuple `(id, end, handled)`; ids are lower-cased.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, index, False
        else:
            ref_id = m.group(1).lower()
            end = m.end(0)
            if not ref_id:
                ref_id = text.lower()
        return ref_id, end, True

    def makeTag(self, href, title, text):
        """ Return an `<a>` element; the title is only set when non-empty. """
        el = etree.Element('a')

        el.set('href', href)
        if title:
            el.set('title', title)

        el.text = text
        return el


class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: [google]. """
    def evalId(self, data, index, text):
        """Evaluate the id from of [ref] """

        return text.lower(), index, True


class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """ Return an `<img>` element; the title is only set when non-empty. """
        el = etree.Element("img")
        el.set("src", href)
        if title:
            el.set("title", title)
        el.set("alt", self.unescape(text))
        return el


class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: ![ref]. """
    def evalId(self, data, index, text):
        """Evaluate the id from of [ref] """

        return text.lower(), index, True


class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m, data):
        el = etree.Element("a")
        el.set('href', self.unescape(m.group(1)))
        el.text = util.AtomicString(m.group(1))
        return el, m.start(0), m.end(0)


class AutomailInlineProcessor(InlineProcessor):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m, data):
        """Build the mailto link with every character entity-encoded.

        The visible text uses named entities where one exists and numeric
        ones otherwise; the `href` uses numeric entities throughout.
        """
        el = etree.Element('a')
        email = self.unescape(m.group(1))
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if entity:
                return "{}{};".format(util.AMP_SUBSTITUTE, entity)
            else:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        letters = [codepoint2name(ord(letter)) for letter in email]
        el.text = util.AtomicString(''.join(letters))

        mailto = "mailto:" + email
        mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
                          ord(letter) for letter in mailto])
        el.set('href', mailto)
        return el, m.start(0), m.end(0)