• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# mako/filters.py
2# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
3#
4# This module is part of Mako and is released under
5# the MIT License: http://www.opensource.org/licenses/mit-license.php
6
7
8import re
9import codecs
10
11from mako.compat import quote_plus, unquote_plus, codepoint2name, \
12        name2codepoint
13
14from mako import compat
15
# Replacement text for each character that xml_escape() rewrites.
# The two quote characters use numeric character references rather than
# the named forms (&quot; / &apos;).
xml_escapes = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    '"': "&#34;",  # numeric equivalent of &quot;
    "'": "&#39;",  # numeric equivalent of &apos;
}
23
24# XXX: &quot; is valid in HTML and XML
25#      &apos; is not valid HTML, but is valid XML
26
def legacy_html_escape(s):
    """Escape the five HTML-significant characters in *s*.

    Legacy HTML escape for non-unicode mode: relies only on the
    ``replace`` method of *s*.  Returns the escaped value.
    """
    # "&" must be handled first, otherwise the ampersands introduced by
    # the later replacements would themselves be escaped again.
    replacements = (
        ("&", "&amp;"),
        (">", "&gt;"),
        ("<", "&lt;"),
        ('"', "&#34;"),
        ("'", "&#39;"),
    )
    for char, entity in replacements:
        s = s.replace(char, entity)
    return s
35
36
# Use markupsafe's escape() when the package can be imported; otherwise
# fall back to the pure-Python legacy_html_escape defined in this module.
try:
    import markupsafe
except ImportError:
    html_escape = legacy_html_escape
else:
    html_escape = markupsafe.escape
42
def xml_escape(string):
    """Return *string* with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced.

    Each matched character is looked up in the module-level
    ``xml_escapes`` table and replaced by the corresponding reference.
    """
    def _substitute(match):
        # The pattern only matches characters that are keys of xml_escapes.
        return xml_escapes[match.group()]

    return re.sub(r'([&<"\'>])', _substitute, string)
45
def url_escape(string):
    """URL-quote *string* after encoding it as UTF-8.

    The text is first turned into its UTF-8 byte sequence and the result
    is passed to ``quote_plus``.
    """
    octets = string.encode("utf8")
    return quote_plus(octets)
50
def legacy_url_escape(string):
    """URL-quote *string* with ``quote_plus``.

    Unlike ``url_escape``, the value is handed to ``quote_plus`` as-is,
    with no prior encoding step.
    """
    quoted = quote_plus(string)
    return quoted
54
def url_unescape(string):
    """Reverse ``quote_plus`` quoting of *string*.

    The value is passed through ``unquote_plus``.  When the result is a
    byte string containing non-ASCII data it is additionally decoded as
    UTF-8; a result that is already text is returned unchanged.
    """
    text = unquote_plus(string)
    # Only byte strings can (and need to) be decoded.  When unquote_plus
    # already returns text, calling .decode() on it would fail, so the
    # UTF-8 decoding step is restricted to byte-string results.
    if isinstance(text, bytes) and not is_ascii_str(text):
        text = text.decode("utf8")
    return text
60
def trim(string):
    """Return the result of calling ``strip()`` on *string*."""
    stripped = string.strip()
    return stripped
63
64
class Decode(object):
    """Namespace whose attributes are decoding functions.

    Looking up any attribute that is not otherwise defined, e.g.
    ``decode.utf8``, yields a function that converts its argument to text
    using the attribute name as the encoding.
    """

    def __getattr__(self, key):
        def decode(x):
            # Text needs no work.
            if isinstance(x, compat.text_type):
                return x
            # Byte strings are decoded with the requested encoding.
            if isinstance(x, compat.binary_type):
                return compat.text_type(x, encoding=key)
            # Anything else is stringified first and then run through the
            # same logic again.
            return decode(str(x))
        return decode


decode = Decode()
76
77
# Matches strings made up exclusively of code points 0-127.
_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')


def is_ascii_str(text):
    """Tell whether *text* is a ``str`` containing only ASCII characters.

    Returns ``False`` when *text* is not a ``str``; otherwise returns the
    result of the regular expression match, i.e. a (truthy) match object
    for pure-ASCII input and ``None`` for anything else.
    """
    if not isinstance(text, str):
        return False
    return _ASCII_re.match(text)
82
83################################################################
84
class XMLEntityEscaper(object):
    """Convert between characters and XML/HTML character references.

    :param codepoint2name: mapping of integer code point to entity name
     (the name only, without the surrounding ``&`` and ``;``).
    :param name2codepoint: mapping of entity name to integer code point,
     used when unescaping.
    """

    def __init__(self, codepoint2name, name2codepoint):
        # Pre-render "&name;" for every known code point so the table can
        # be used directly by translate() and by __escape().
        self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n))
                                      for c, n in codepoint2name.items()])
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return compat.text_type(text).translate(self.codepoint2entity)

    def __escape(self, m):
        # Substitution callback for escape(): prefer the named entity and
        # fall back to a hexadecimal numeric character reference.
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return '&#x%X;' % codepoint

    # The four markup-significant ASCII characters plus anything non-ASCII.
    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII; it is the result of
        encoding the escaped text with the ``ascii`` codec.
        """
        return self.__escapable.sub(self.__escape, compat.text_type(text)
                                    ).encode('ascii')

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving CombiningChars and Extenders.)
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    #
    # Hexadecimal digits are accepted in either case: escape() above emits
    # upper-case digits ('&#x%X;'), so they must be recognised here for the
    # output of escape() to be unescapable.
    __characterrefs = re.compile(r'''& (?:
                                          \#(\d+)
                                          | \#x([\da-fA-F]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;''',
                                 re.X | re.UNICODE)

    def __unescape(self, m):
        # Substitution callback for unescape(): exactly one of the three
        # groups (decimal, hexadecimal, entity name) is set for a match.
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            codepoint = self.name2codepoint.get(name, 0xfffd)
            # U+FFFD = "REPLACEMENT CHARACTER"
        try:
            return chr(codepoint)
        except (ValueError, OverflowError):
            # Numeric reference outside the range chr() accepts: treat it
            # like an unknown entity name instead of aborting the whole
            # substitution.
            return chr(0xfffd)

    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.  Unknown entity names and
        numeric references that do not denote a valid code point are
        replaced by U+FFFD.
        """
        return self.__characterrefs.sub(self.__unescape, text)
151
152
# Module-wide escaper built from the entity tables imported at the top of
# this file.
_html_entities_escaper = XMLEntityEscaper(
    codepoint2name=codepoint2name,
    name2codepoint=name2codepoint,
)

# Public filter functions: bound methods of the shared escaper.
html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape
157
158
def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python `codecs`_ error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references.

    >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
    'The cost was &euro;12.'

    :param ex: the exception describing the failure.
    :return: a ``(replacement_text, resume_position)`` tuple for
     ``UnicodeEncodeError``.
    :raises: *ex* itself for any other kind of error.
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start:ex.end]
        text = _html_entities_escaper.escape(bad_text)
        if isinstance(text, bytes):
            # escape() returns ASCII-encoded bytes.  Converting bytes to
            # text via the text type's constructor can yield the
            # "b'...'" repr instead of the content, so decode explicitly.
            text = text.decode('ascii')
        return (compat.text_type(text), ex.end)
    raise ex


codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
177
178
179# TODO: options to make this dynamic per-compilation will be added in a later
180# release
# Short filter name -> name (as a string) of the callable implementing it.
DEFAULT_ESCAPES = {
    'x': 'filters.xml_escape',
    'h': 'filters.html_escape',
    'u': 'filters.url_escape',
    'trim': 'filters.trim',
    'entity': 'filters.html_entities_escape',
    'unicode': 'unicode',
    'decode': 'decode',
    'str': 'str',
    'n': 'n',
}

# Under Python 3 the 'unicode' filter is mapped to 'str' instead.
if compat.py3k:
    DEFAULT_ESCAPES['unicode'] = 'str'
197
# Same table as DEFAULT_ESCAPES, except that 'h' and 'u' point at the
# legacy (non-unicode) implementations.
NON_UNICODE_ESCAPES = dict(
    DEFAULT_ESCAPES,
    h='filters.legacy_html_escape',
    u='filters.legacy_url_escape',
)
201
202