"""Tests for the html5lib HTML serializer.

Contains hand-written encoding tests, optional lxml entity tests, and a
generator that yields one test per entry in the JSON ``*.test`` data files.
"""
from __future__ import absolute_import, division, unicode_literals

import io
import json
import unittest

from .support import get_data_files

# Alias for unittest versions that only provide the old ``assertEquals`` name.
try:
    unittest.TestCase.assertEqual
except AttributeError:
    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals

import html5lib
from html5lib import constants
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers._base import TreeWalker

# Names of optional third-party packages that imported successfully.
optionals_loaded = []

try:
    from lxml import etree
    optionals_loaded.append("lxml")
except ImportError:
    pass

# Namespace assumed for tag tokens that do not carry one explicitly.
default_namespace = constants.namespaces["html"]


class JsonWalker(TreeWalker):
    """Tree walker over the list-based tokens used by the serializer tests.

    ``self.tree`` is iterated directly (presumably it is the sequence passed
    to the constructor -- confirm against ``TreeWalker.__init__``).  Each
    item is a list whose first element names the token type and whose
    remaining elements are that token's fields.
    """

    def __iter__(self):
        """Yield tree-walker tokens for every test token in ``self.tree``.

        Raises:
            ValueError: if a token has an unrecognised type.
        """
        for token in self.tree:
            token_type = token[0]
            if token_type == "StartTag":
                # Four fields means the namespace is explicit; otherwise
                # fall back to the HTML namespace.
                if len(token) == 4:
                    namespace, name, attrib = token[1:4]
                else:
                    namespace = default_namespace
                    name, attrib = token[1:3]
                yield self.startTag(namespace, name, self._convertAttrib(attrib))
            elif token_type == "EndTag":
                if len(token) == 3:
                    namespace, name = token[1:3]
                else:
                    namespace = default_namespace
                    name = token[1]
                yield self.endTag(namespace, name)
            elif token_type == "EmptyTag":
                if len(token) == 4:
                    namespace, name, attrib = token[1:]
                else:
                    namespace = default_namespace
                    name, attrib = token[1:]
                # emptyTag() and text() are iterated because they may
                # produce more than one token.
                for event in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
                    yield event
            elif token_type == "Comment":
                yield self.comment(token[1])
            elif token_type in ("Characters", "SpaceCharacters"):
                for event in self.text(token[1]):
                    yield event
            elif token_type == "Doctype":
                # A doctype carries a name plus optional public and system
                # identifiers, in that order.
                if len(token) == 4:
                    yield self.doctype(token[1], token[2], token[3])
                elif len(token) == 3:
                    yield self.doctype(token[1], token[2])
                else:
                    yield self.doctype(token[1])
            else:
                raise ValueError("Unknown token type: " + token_type)

    def _convertAttrib(self, attribs):
        """html5lib tree-walkers use a dict of (namespace, name): value for
        attributes, but JSON cannot represent this. Convert from the format
        in the serializer tests (a list of dicts with "namespace", "name",
        and "value" as keys) to html5lib's tree-walker format."""
        attrs = {}
        for attrib in attribs:
            name = (attrib["namespace"], attrib["name"])
            # Test data must not repeat an attribute on the same element.
            assert name not in attrs, "duplicate attribute in test data: %r" % (name,)
            attrs[name] = attrib["value"]
        return attrs


def serialize_html(input, options):
    """Serialize a list of test tokens with ``HTMLSerializer``.

    Args:
        input: test tokens in the format understood by ``JsonWalker``.
        options: mapping of serializer options; every entry is passed to
            ``HTMLSerializer`` as a keyword argument, and the "encoding"
            entry (if any) is also passed to ``render``.

    Returns:
        Whatever ``HTMLSerializer.render`` returns for the token stream.
    """
    # Keys are coerced with str() before being used as keyword names
    # (presumably because JSON / unicode_literals keys are not native
    # strings on Python 2 -- confirm).
    options = dict((str(k), v) for k, v in options.items())
    stream = JsonWalker(input)
    serializer = HTMLSerializer(alphabetical_attributes=True, **options)
    return serializer.render(stream, options.get("encoding", None))


def runSerializerTest(input, expected, options):
    """Serialize ``input`` and assert the result is one of ``expected``.

    Args:
        input: test tokens in the format understood by ``JsonWalker``.
        expected: list of acceptable serializations, as text.
        options: serializer options; when "encoding" is set, every expected
            string is encoded with it before comparison, so each one must
            be representable in that encoding.

    Raises:
        AssertionError: if the serialized result matches no expected value.
    """
    encoding = options.get("encoding", None)

    if encoding:
        expected = [text.encode(encoding) for text in expected]

    result = serialize_html(input, options)
    if len(expected) == 1:
        assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
    else:
        assert result in expected, "Expected: %s, Received: %s" % (expected, result)


class EncodingTestCase(unittest.TestCase):
    """Check how U+0101, which ISO-8859-1 cannot represent, is handled.

    Each test either expects ``UnicodeEncodeError`` or expects the
    character reference ``&amacr;`` in the output.
    """

    def throwsWithLatin1(self, input):
        """Assert that serializing ``input`` as ISO-8859-1 raises UnicodeEncodeError."""
        self.assertRaises(UnicodeEncodeError, serialize_html, input, {"encoding": "iso-8859-1"})

    def testDoctypeName(self):
        self.throwsWithLatin1([["Doctype", "\u0101"]])

    def testDoctypePublicId(self):
        self.throwsWithLatin1([["Doctype", "potato", "\u0101"]])

    def testDoctypeSystemId(self):
        self.throwsWithLatin1([["Doctype", "potato", "potato", "\u0101"]])

    def testCdataCharacters(self):
        # The expected text is written with a character reference: it gets
        # encoded as iso-8859-1 by runSerializerTest, which a literal
        # U+0101 could not survive.
        runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
                          ["<style>&amacr;"], {"encoding": "iso-8859-1"})

    def testCharacters(self):
        runSerializerTest([["Characters", "\u0101"]],
                          ["&amacr;"], {"encoding": "iso-8859-1"})

    def testStartTagName(self):
        self.throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])

    def testEmptyTagName(self):
        self.throwsWithLatin1([["EmptyTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])

    def testAttributeName(self):
        self.throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": None, "name": "\u0101", "value": "potato"}]]])

    def testAttributeValue(self):
        runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
                            [{"namespace": None, "name": "potato", "value": "\u0101"}]]],
                          ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})

    def testEndTagName(self):
        self.throwsWithLatin1([["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]])

    def testComment(self):
        self.throwsWithLatin1([["Comment", "\u0101"]])


if "lxml" in optionals_loaded:
    class LxmlTestCase(unittest.TestCase):
        """Entity handling when serializing lxml trees (only defined if lxml imports)."""

        def setUp(self):
            # Entities are left unresolved by the parser so that the
            # serializer is the component that handles them.
            self.parser = etree.XMLParser(resolve_entities=False)
            self.treewalker = html5lib.getTreeWalker("lxml")
            self.serializer = HTMLSerializer()

        def testEntityReplacement(self):
            # By default the named entity is expected to come out as the
            # character it stands for.
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
            tree = etree.fromstring(doc, parser=self.parser).getroottree()
            result = serialize(tree, tree="lxml", omit_optional_tags=False)
            self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)

        def testEntityXML(self):
            # A predefined XML entity is expected to round-trip unchanged.
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
            tree = etree.fromstring(doc, parser=self.parser).getroottree()
            result = serialize(tree, tree="lxml", omit_optional_tags=False)
            self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)

        def testEntityNoResolve(self):
            # With resolve_entities=False the entity reference is expected
            # to be emitted as written.
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
            tree = etree.fromstring(doc, parser=self.parser).getroottree()
            result = serialize(tree, tree="lxml", omit_optional_tags=False,
                               resolve_entities=False)
            self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)


def test_serializer():
    """Yield one ``runSerializerTest`` call per entry in the serializer data files.

    Each yielded item is a tuple of the test callable followed by its
    arguments: the test's "input", its "expected" list, and its "options"
    (an empty dict when the test defines none).
    """
    for filename in get_data_files('serializer', '*.test'):
        # Decode explicitly as UTF-8 so results do not depend on the locale;
        # the file is closed before any test case is yielded.
        with io.open(filename, encoding="utf-8") as fp:
            tests = json.load(fp)
        for test in tests['tests']:
            yield runSerializerTest, test["input"], test["expected"], test.get("options", {})