# mako/filters.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php


import re
import codecs

from mako.compat import quote_plus, unquote_plus, codepoint2name, \
    name2codepoint

from mako import compat

# Replacement text for each character handled by xml_escape().
xml_escapes = {
    '&': '&amp;',
    '>': '&gt;',
    '<': '&lt;',
    '"': '&#34;',   # also &quot; in html-only
    "'": '&#39;'    # also &apos; in html-only
}

# XXX: &quot; is valid in HTML and XML
#      &apos; is not valid HTML, but is valid XML


def legacy_html_escape(s):
    """legacy HTML escape for non-unicode mode."""
    # '&' must be replaced first so that the ampersands introduced by the
    # later replacements are not escaped a second time.
    s = s.replace("&", "&amp;")
    s = s.replace(">", "&gt;")
    s = s.replace("<", "&lt;")
    s = s.replace('"', "&#34;")
    s = s.replace("'", "&#39;")
    return s


# Use markupsafe's escape when it is installed, otherwise fall back to the
# pure-Python implementation above.
try:
    import markupsafe
    html_escape = markupsafe.escape
except ImportError:
    html_escape = legacy_html_escape


def xml_escape(string):
    """Replace ``&``, ``<``, ``>``, ``"`` and ``'`` using ``xml_escapes``."""
    return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)


def url_escape(string):
    """Encode ``string`` as UTF-8 and quote it with ``quote_plus``."""
    # convert into a list of octets
    string = string.encode("utf8")
    return quote_plus(string)


def legacy_url_escape(string):
    """Quote ``string`` with ``quote_plus`` without encoding it first."""
    # convert into a list of octets
    return quote_plus(string)


def url_unescape(string):
    """Reverse ``quote_plus`` quoting.

    A non-ASCII byte string result is decoded as UTF-8.  A result that is
    already text is returned unchanged; text strings have no ``decode``
    method on Python 3, so attempting to decode them would raise
    ``AttributeError``.
    """
    text = unquote_plus(string)
    if isinstance(text, compat.binary_type) and not is_ascii_str(text):
        text = text.decode("utf8")
    return text


def trim(string):
    """Strip leading and trailing whitespace from ``string``."""
    return string.strip()


class Decode(object):
    """Factory of decoding filters; the attribute name is the encoding.

    ``decode.utf8`` (any attribute name) returns a function which converts
    its argument to text using that name as the encoding.
    """

    def __getattr__(self, key):
        def decode(x):
            if isinstance(x, compat.text_type):
                # already text, nothing to decode
                return x
            elif not isinstance(x, compat.binary_type):
                # neither text nor bytes: stringify first, then retry
                return decode(str(x))
            else:
                return compat.text_type(x, encoding=key)
        return decode
decode = Decode()


_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')


def is_ascii_str(text):
    """Return a true value if ``text`` is a ``str`` holding only ASCII.

    The result is the regular expression match object (truthy) on success
    and ``False`` or ``None`` otherwise.
    """
    return isinstance(text, str) and _ASCII_re.match(text)

################################################################


class XMLEntityEscaper(object):
    """Convert between characters and XML/HTML character references."""

    def __init__(self, codepoint2name, name2codepoint):
        """Build the lookup tables.

        :param codepoint2name: mapping of code point to entity name.
        :param name2codepoint: mapping of entity name to code point.
        """
        self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n))
                                      for c, n in codepoint2name.items()])
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return compat.text_type(text).translate(self.codepoint2entity)

    def __escape(self, m):
        """Return the reference for the single character matched by ``m``.

        The named entity is used when one exists, otherwise an upper-case
        hexadecimal numeric reference.
        """
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return '&#x%X;' % codepoint

    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII.
        """
        return self.__escapable.sub(self.__escape, compat.text_type(text)
                                    ).encode('ascii')

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving CombiningChars and Extenders.)
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    #
    # Hexadecimal references accept both upper- and lower-case digits so
    # that the upper-case output of escape() can be unescaped again.
    __characterrefs = re.compile(r'''& (?:
                                          \#(\d+)
                                          | \#x([\da-fA-F]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;''',
                                 re.X | re.UNICODE)

    def __unescape(self, m):
        """Return the character for the reference matched by ``m``.

        Handles decimal (``&#NNN;``), hexadecimal (``&#xHHH;``) and named
        references; an unknown name maps to U+FFFD.
        """
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            codepoint = self.name2codepoint.get(name, 0xfffd)
            # U+FFFD = "REPLACEMENT CHARACTER"
        return chr(codepoint)

    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.
        """
        return self.__characterrefs.sub(self.__unescape, text)


_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint)

html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape


def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python `codecs`_ error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references.

    >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
    'The cost was &euro;12.'

    (On Python 3 the encoded result is a ``bytes`` object.)
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start:ex.end]
        text = _html_entities_escaper.escape(bad_text)
        # escape() returns ASCII bytes; decode them explicitly, because
        # calling str() on bytes under Python 3 would produce "b'...'".
        return (text.decode('ascii'), ex.end)
    raise ex

codecs.register_error('htmlentityreplace', htmlentityreplace_errors)


# TODO: options to make this dynamic per-compilation will be added in a later
# release
DEFAULT_ESCAPES = {
    'x': 'filters.xml_escape',
    'h': 'filters.html_escape',
    'u': 'filters.url_escape',
    'trim': 'filters.trim',
    'entity': 'filters.html_entities_escape',
    'unicode': 'unicode',
    'decode': 'decode',
    'str': 'str',
    'n': 'n'
}

if compat.py3k:
    DEFAULT_ESCAPES.update({
        'unicode': 'str'
    })

NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy()
NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape'
NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape'