• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from __future__ import absolute_import, division, unicode_literals
2
3try:
4    import json
5except ImportError:
6    import simplejson as json
7
8from html5lib import html5parser, sanitizer, constants, treebuilders
9
10
def toxmlFactory():
    """Return a function that serializes an etree element to a text string.

    The returned callable uses the "etree" tree builder's implementation
    module to produce the serialization.
    """
    builder = treebuilders.getTreeBuilder("etree")

    def toxml(element):
        # Serialize to UTF-8 bytes, then decode: this encode/decode roundtrip
        # is required for Python 2.6 compatibility.
        return builder.implementation.tostring(
            element, encoding="utf-8").decode("utf-8")

    return toxml
20
21
def runSanitizerTest(name, expected, input, toxml=None):
    # Check that sanitizing `input` serializes to the same string as parsing
    # `expected` with a plain (non-sanitizing) HTMLParser.
    #
    #   name     -- label of the generated case; reported on failure.
    #   expected -- markup describing the expected sanitized result.
    #   input    -- markup handed to the sanitizer.
    #   toxml    -- serializer for parsed nodes; a fresh one is built when
    #               omitted.
    #
    # Documented with comments rather than a docstring so that runners which
    # display a docstring as the test description keep showing per-case names.
    if toxml is None:
        toxml = toxmlFactory()

    fragment = html5parser.HTMLParser().parseFragment(expected)
    expected = ''.join([toxml(token) for token in fragment])
    # JSON roundtrip kept as-is; presumably it normalizes the string type
    # (e.g. to unicode on Python 2) -- TODO confirm.
    expected = json.loads(json.dumps(expected))

    # Forward the serializer so both sides are rendered by the same toxml
    # instead of sanitize_html() building a second one for every case.
    actual = sanitize_html(input, toxml)
    assert expected == actual, "%s: expected %r, got %r" % (name, expected, actual)
29
30
def sanitize_html(stream, toxml=None):
    """Parse *stream* as a fragment through the sanitizer and serialize it.

    *stream* is parsed with an HTMLParser whose tokenizer is
    ``sanitizer.HTMLSanitizer``. Each item of the parsed fragment is rendered
    with *toxml* (built via ``toxmlFactory()`` when not supplied) and the
    pieces are concatenated into one string.
    """
    serialize = toxmlFactory() if toxml is None else toxml
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = parser.parseFragment(stream)
    return ''.join(serialize(node) for node in fragment)
37
38
def test_should_handle_astral_plane_characters():
    # Numeric character references beyond the Basic Multilingual Plane must
    # come out of the sanitizer as the corresponding characters.
    markup = "<p>&#x1d4b5; &#x1d538;</p>"
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>'
    assert expected == sanitize_html(markup)
41
42
def test_should_allow_relative_uris():
    # An href without any protocol (a relative URI) must be kept untouched.
    markup = '<p><a href="/example.com"></a></p>'
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>'
    assert expected == sanitize_html(markup)
45
46
def test_sanitizer():
    # Generator test: every yielded tuple is
    #   (runSanitizerTest, case name, expected markup, input markup, toxml)
    # and all cases share the single serializer created here.
    # Documented with comments rather than a docstring so that runners which
    # display a docstring as the test description are unaffected.
    toxml = toxmlFactory()

    # Allowed lowercase elements: the element and its title attribute are
    # expected to survive, while the unknown <bad> element is expected to be
    # escaped into text.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
            continue  # TODO
        if tag_name != tag_name.lower():
            continue  # TODO
        if tag_name == 'image':
            # <image> is expected to be serialized as <img>.
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)
        elif tag_name == 'br':
            # The stray </br> end tag is expected to yield a second <br/>.
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)
        elif tag_name in constants.voidElements:
            # Void elements have no content, so the text follows the element.
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)
        else:
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)

    # Upper-cased element names: the whole element is expected to be escaped.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        tag_name = tag_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
               "&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    # Allowed lowercase attributes are expected to be kept on <p>.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        attribute_value = 'foo'
        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
            # URI-valued attributes get a URI built from the first allowed
            # protocol as their value.
            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
               toxml)

    # Upper-cased attribute names are expected to be dropped.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        attribute_name = attribute_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
               "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
               "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
               toxml)

    # Allowed protocols, spelled as listed, are expected to be kept in src.
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        # Case name fixed: this loop does not upper-case the protocol, so it
        # must not reuse the "uppercase" label of the loop at the end.
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)

    # A data: URI without a media type is expected to be removed.
    yield (runSanitizerTest, "test_invalid_data_uri",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:foobar\"></audio>",
           toxml)

    # A data: URI with a text/html media type is expected to be removed.
    yield (runSanitizerTest, "test_data_uri_disallowed_type",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>",
           toxml)

    # Allowed protocols are expected to be kept when upper-cased as well.
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        protocol = protocol.upper()
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)
130