from __future__ import absolute_import, division, unicode_literals
try:
import json
except ImportError:
import simplejson as json
from html5lib import html5parser, sanitizer, constants, treebuilders
def toxmlFactory():
tree = treebuilders.getTreeBuilder("etree")
def toxml(element):
# encode/decode roundtrip required for Python 2.6 compatibility
result_bytes = tree.implementation.tostring(element, encoding="utf-8")
return result_bytes.decode("utf-8")
return toxml
def runSanitizerTest(name, expected, input, toxml=None):
if toxml is None:
toxml = toxmlFactory()
expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
parseFragment(expected)])
expected = json.loads(json.dumps(expected))
assert expected == sanitize_html(input)
def sanitize_html(stream, toxml=None):
if toxml is None:
toxml = toxmlFactory()
return ''.join([toxml(token) for token in
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
parseFragment(stream)])
def test_should_handle_astral_plane_characters():
assert '\U0001d4b5 \U0001d538' == sanitize_html("
𝒵 𝔸
")
def test_should_allow_relative_uris():
assert '' == sanitize_html('
')
def test_sanitizer():
toxml = toxmlFactory()
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
continue # TODO
if tag_name != tag_name.lower():
continue # TODO
if tag_name == 'image':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"
foo <bad>bar</bad> baz",
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name),
toxml)
elif tag_name == 'br':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"
foo <bad>bar</bad> baz
",
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name),
toxml)
elif tag_name in constants.voidElements:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name,
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name),
toxml)
else:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\">foo <bad>bar</bad> baz%s>" % (tag_name, tag_name),
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name),
toxml)
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
tag_name = tag_name.upper()
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
"<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
"<%s title='1'>foo bar baz%s>" % (tag_name, tag_name),
toxml)
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
if attribute_name != attribute_name.lower():
continue # TODO
if attribute_name == 'style':
continue
attribute_value = 'foo'
if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
"foo <bad>bar</bad> baz
" % (attribute_name, attribute_value),
"foo bar baz
" % (attribute_name, attribute_value),
toxml)
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
attribute_name = attribute_name.upper()
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
"foo <bad>bar</bad> baz
",
"foo bar baz
" % attribute_name,
toxml)
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"
foo" % (protocol, rest_of_uri),
"""
foo""" % (protocol, rest_of_uri),
toxml)
yield (runSanitizerTest, "test_invalid_data_uri",
"",
"",
toxml)
yield (runSanitizerTest, "test_data_uri_disallowed_type",
"",
"",
toxml)
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
protocol = protocol.upper()
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"
foo" % (protocol, rest_of_uri),
"""
foo""" % (protocol, rest_of_uri),
toxml)