"""
General functions for HTML manipulation.
"""

import re as _re
from html.entities import html5 as _html5


__all__ = ['escape', 'unescape']


def escape(s, quote=True):
    """
    Replace special characters "&", "<" and ">" to HTML-safe sequences.
    If the optional flag quote is true (the default), the quotation mark
    characters, both double quote (") and single quote (') characters are also
    translated.

    Returns the escaped copy of the string s.
    """
    # "&" must be done first, otherwise the ampersands introduced by the
    # later replacements would themselves be escaped a second time.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    if quote:
        s = s.replace('"', "&quot;")
        s = s.replace('\'', "&#x27;")
    return s


# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references

# Numeric character references that the standard maps to a different
# character than the one their number designates (mostly the Windows-1252
# interpretation of the 0x80-0x9f range).
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

# Code points whose numeric character reference is dropped entirely
# (replaced by the empty string) by _replace_charref.
_invalid_codepoints = {
    # 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    # 0x000E to 0x001F
    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    # 0x007F to 0x009F
    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    # 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    # others
    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
    0x10fffe, 0x10ffff
}


def _replace_charref(s):
    """
    Return the replacement text for one character reference.

    s is a match object produced by _charref; its group 1 is the reference
    without the leading "&" (and with an optional trailing ";").
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base, max_digits = s[2:], 16, 6
        else:
            digits, base, max_digits = s[1:], 10, 7
        digits = digits.rstrip(';').lstrip('0')
        # 0x10FFFF has 6 significant hex digits / 7 decimal digits, so
        # anything longer is out of range.  Bailing out here avoids calling
        # int() on an arbitrarily long digit run, which can raise ValueError
        # where the interpreter limits int/str conversion length.
        if len(digits) > max_digits:
            return '\uFFFD'
        # An empty string means the reference consisted only of zeros.
        num = int(digits, base) if digits else 0
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            # surrogates and values beyond the Unicode range
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)
    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard);
    # whatever follows the matched prefix is kept verbatim
    for x in range(len(s)-1, 1, -1):
        if s[:x] in _html5:
            return _html5[s[:x]] + s[x:]
    # no known name: leave the text untouched
    return '&' + s


# Matches a decimal, hexadecimal or named character reference; group 1 is
# everything after the "&".
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')


def unescape(s):
    """
    Convert all named and numeric character references (e.g. &gt;, &#62;,
    &#x3e;) in the string s to the corresponding unicode characters.
    This function uses the rules defined by the HTML 5 standard
    for both valid and invalid character references, and the list of
    HTML 5 named character references defined in html.entities.html5.
    """
    # Fast path: without an ampersand there is nothing to convert.
    if '&' not in s:
        return s
    return _charref.sub(_replace_charref, s)